Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 88a8b59..2b36f05 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config NVME_CORE
tristate
@@ -57,3 +58,19 @@
from https://github.com/linux-nvme/nvme-cli.
If unsure, say N.
+
+config NVME_TCP
+ tristate "NVM Express over Fabrics TCP host driver"
+ depends on INET
+ depends on BLK_DEV_NVME
+ select NVME_FABRICS
+ select CRYPTO_CRC32C
+ help
+ This provides support for the NVMe over Fabrics protocol using
+ the TCP transport. This allows you to use remote block devices
+ exported using the NVMe protocol set.
+
+ To configure an NVMe over Fabrics controller, use the nvme-cli tool
+ from https://github.com/linux-nvme/nvme-cli.
+
+ If unsure, say N.
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index aea459c..8a4b671 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -7,6 +7,7 @@
obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o
obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o
obj-$(CONFIG_NVME_FC) += nvme-fc.o
+obj-$(CONFIG_NVME_TCP) += nvme-tcp.o
nvme-core-y := core.o
nvme-core-$(CONFIG_TRACING) += trace.o
@@ -21,3 +22,5 @@
nvme-rdma-y += rdma.o
nvme-fc-y += fc.o
+
+nvme-tcp-y += tcp.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index e5bddae..fa7ba09 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* NVM Express device driver
* Copyright (c) 2011-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
*/
#include <linux/blkdev.h>
@@ -19,6 +11,7 @@
#include <linux/hdreg.h>
#include <linux/kernel.h>
#include <linux/module.h>
+#include <linux/backing-dev.h>
#include <linux/list_sort.h>
#include <linux/slab.h>
#include <linux/types.h>
@@ -29,12 +22,12 @@
#include <linux/pm_qos.h>
#include <asm/unaligned.h>
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
#include "nvme.h"
#include "fabrics.h"
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
#define NVME_MINORS (1U << MINORBITS)
unsigned int admin_timeout = 60;
@@ -88,7 +81,6 @@
struct workqueue_struct *nvme_delete_wq;
EXPORT_SYMBOL_GPL(nvme_delete_wq);
-static DEFINE_IDA(nvme_subsystems_ida);
static LIST_HEAD(nvme_subsystems);
static DEFINE_MUTEX(nvme_subsystems_lock);
@@ -97,7 +89,6 @@
static struct class *nvme_class;
static struct class *nvme_subsys_class;
-static void nvme_ns_remove(struct nvme_ns *ns);
static int nvme_revalidate_disk(struct gendisk *disk);
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
@@ -111,10 +102,13 @@
*/
if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
return;
- revalidate_disk(ns->disk);
blk_set_queue_dying(ns->queue);
/* Forcibly unquiesce queues to avoid blocking dispatch */
blk_mq_unquiesce_queue(ns->queue);
+ /*
+ * Revalidate after unblocking dispatchers that may be holding bd_mutex
+ */
+ revalidate_disk(ns->disk);
}
static void nvme_queue_scan(struct nvme_ctrl *ctrl)
@@ -122,10 +116,26 @@
/*
* Only new queue scan work when admin and IO queues are both alive
*/
- if (ctrl->state == NVME_CTRL_LIVE)
+ if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
queue_work(nvme_wq, &ctrl->scan_work);
}
+/*
+ * Use this function to proceed with scheduling reset_work for a controller
+ * that had previously been set to the resetting state. This is intended for
+ * code paths that can't be interrupted by other reset attempts. A hot removal
+ * may prevent this from succeeding.
+ */
+int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
+{
+ if (ctrl->state != NVME_CTRL_RESETTING)
+ return -EBUSY;
+ if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
+ return -EBUSY;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
+
int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
@@ -143,8 +153,7 @@
ret = nvme_reset_ctrl(ctrl);
if (!ret) {
flush_work(&ctrl->reset_work);
- if (ctrl->state != NVME_CTRL_LIVE &&
- ctrl->state != NVME_CTRL_ADMIN_ONLY)
+ if (ctrl->state != NVME_CTRL_LIVE)
ret = -ENETRESET;
}
@@ -152,11 +161,8 @@
}
EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
-static void nvme_delete_ctrl_work(struct work_struct *work)
+static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
{
- struct nvme_ctrl *ctrl =
- container_of(work, struct nvme_ctrl, delete_work);
-
dev_info(ctrl->device,
"Removing ctrl: NQN \"%s\"\n", ctrl->opts->subsysnqn);
@@ -168,6 +174,14 @@
nvme_put_ctrl(ctrl);
}
+static void nvme_delete_ctrl_work(struct work_struct *work)
+{
+ struct nvme_ctrl *ctrl =
+ container_of(work, struct nvme_ctrl, delete_work);
+
+ nvme_do_delete_ctrl(ctrl);
+}
+
int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
@@ -178,31 +192,31 @@
}
EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
-int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
+static int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
{
int ret = 0;
/*
- * Keep a reference until the work is flushed since ->delete_ctrl
- * can free the controller.
+ * Keep a reference until nvme_do_delete_ctrl() completes,
+ * since ->delete_ctrl can free the controller.
*/
nvme_get_ctrl(ctrl);
- ret = nvme_delete_ctrl(ctrl);
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
+ ret = -EBUSY;
if (!ret)
- flush_work(&ctrl->delete_work);
+ nvme_do_delete_ctrl(ctrl);
nvme_put_ctrl(ctrl);
return ret;
}
-EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync);
static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
{
return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
}
-static blk_status_t nvme_error_status(struct request *req)
+static blk_status_t nvme_error_status(u16 status)
{
- switch (nvme_req(req)->status & 0x7ff) {
+ switch (status & 0x7ff) {
case NVME_SC_SUCCESS:
return BLK_STS_OK;
case NVME_SC_CAP_EXCEEDED:
@@ -229,6 +243,8 @@
return BLK_STS_PROTECTION;
case NVME_SC_RESERVATION_CONFLICT:
return BLK_STS_NEXUS;
+ case NVME_SC_HOST_PATH_ERROR:
+ return BLK_STS_TRANSPORT;
default:
return BLK_STS_IOERR;
}
@@ -245,12 +261,31 @@
return true;
}
+static void nvme_retry_req(struct request *req)
+{
+ struct nvme_ns *ns = req->q->queuedata;
+ unsigned long delay = 0;
+ u16 crd;
+
+ /* The mask and shift result must be <= 3 */
+ crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
+ if (ns && crd)
+ delay = ns->ctrl->crdt[crd - 1] * 100;
+
+ nvme_req(req)->retries++;
+ blk_mq_requeue_request(req, false);
+ blk_mq_delay_kick_requeue_list(req->q, delay);
+}
+
void nvme_complete_rq(struct request *req)
{
- blk_status_t status = nvme_error_status(req);
+ blk_status_t status = nvme_error_status(nvme_req(req)->status);
trace_nvme_complete_rq(req);
+ if (nvme_req(req)->ctrl->kas)
+ nvme_req(req)->ctrl->comp_seen = true;
+
if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
if ((req->cmd_flags & REQ_NVME_MPATH) &&
blk_path_error(status)) {
@@ -259,23 +294,28 @@
}
if (!blk_queue_dying(req->q)) {
- nvme_req(req)->retries++;
- blk_mq_requeue_request(req, true);
+ nvme_retry_req(req);
return;
}
}
+
+ nvme_trace_bio_complete(req, status);
blk_mq_end_request(req, status);
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
-void nvme_cancel_request(struct request *req, void *data, bool reserved)
+bool nvme_cancel_request(struct request *req, void *data, bool reserved)
{
dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
"Cancelling I/O %d", req->tag);
- nvme_req(req)->status = NVME_SC_ABORT_REQ;
- blk_mq_complete_request(req);
+ /* don't abort one completed request */
+ if (blk_mq_request_completed(req))
+ return true;
+ nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
+ blk_mq_complete_request(req);
+ return true;
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
@@ -290,15 +330,6 @@
old_state = ctrl->state;
switch (new_state) {
- case NVME_CTRL_ADMIN_ONLY:
- switch (old_state) {
- case NVME_CTRL_CONNECTING:
- changed = true;
- /* FALLTHRU */
- default:
- break;
- }
- break;
case NVME_CTRL_LIVE:
switch (old_state) {
case NVME_CTRL_NEW:
@@ -314,7 +345,6 @@
switch (old_state) {
case NVME_CTRL_NEW:
case NVME_CTRL_LIVE:
- case NVME_CTRL_ADMIN_ONLY:
changed = true;
/* FALLTHRU */
default:
@@ -334,7 +364,6 @@
case NVME_CTRL_DELETING:
switch (old_state) {
case NVME_CTRL_LIVE:
- case NVME_CTRL_ADMIN_ONLY:
case NVME_CTRL_RESETTING:
case NVME_CTRL_CONNECTING:
changed = true;
@@ -356,8 +385,10 @@
break;
}
- if (changed)
+ if (changed) {
ctrl->state = new_state;
+ wake_up_all(&ctrl->state_wq);
+ }
spin_unlock_irqrestore(&ctrl->lock, flags);
if (changed && ctrl->state == NVME_CTRL_LIVE)
@@ -366,6 +397,39 @@
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
+/*
+ * Returns true for sink states that can't ever transition back to live.
+ */
+static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
+{
+ switch (ctrl->state) {
+ case NVME_CTRL_NEW:
+ case NVME_CTRL_LIVE:
+ case NVME_CTRL_RESETTING:
+ case NVME_CTRL_CONNECTING:
+ return false;
+ case NVME_CTRL_DELETING:
+ case NVME_CTRL_DEAD:
+ return true;
+ default:
+ WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
+ return true;
+ }
+}
+
+/*
+ * Waits for the controller state to be resetting, or returns false if it is
+ * not possible to ever transition to that state.
+ */
+bool nvme_wait_reset(struct nvme_ctrl *ctrl)
+{
+ wait_event(ctrl->state_wq,
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
+ nvme_state_terminal(ctrl));
+ return ctrl->state == NVME_CTRL_RESETTING;
+}
+EXPORT_SYMBOL_GPL(nvme_wait_reset);
+
static void nvme_free_ns_head(struct kref *ref)
{
struct nvme_ns_head *head =
@@ -374,7 +438,7 @@
nvme_mpath_remove_disk(head);
ida_simple_remove(&head->subsys->ns_ida, head->instance);
list_del_init(&head->entry);
- cleanup_srcu_struct_quiesced(&head->srcu);
+ cleanup_srcu_struct(&head->srcu);
nvme_put_subsystem(head->subsys);
kfree(head);
}
@@ -536,7 +600,6 @@
static inline void nvme_setup_flush(struct nvme_ns *ns,
struct nvme_command *cmnd)
{
- memset(cmnd, 0, sizeof(*cmnd));
cmnd->common.opcode = nvme_cmd_flush;
cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
}
@@ -548,9 +611,19 @@
struct nvme_dsm_range *range;
struct bio *bio;
- range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
- if (!range)
- return BLK_STS_RESOURCE;
+ range = kmalloc_array(segments, sizeof(*range),
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!range) {
+ /*
+ * If we fail to allocate our range, fall back to the controller
+ * discard page. If that's also busy, it's safe to return
+ * busy, as we know we can make progress once that's freed.
+ */
+ if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
+ return BLK_STS_RESOURCE;
+
+ range = page_address(ns->ctrl->discard_page);
+ }
__rq_for_each_bio(bio, req) {
u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
@@ -565,11 +638,13 @@
}
if (WARN_ON_ONCE(n != segments)) {
- kfree(range);
+ if (virt_to_page(range) == ns->ctrl->discard_page)
+ clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+ else
+ kfree(range);
return BLK_STS_IOERR;
}
- memset(cmnd, 0, sizeof(*cmnd));
cmnd->dsm.opcode = nvme_cmd_dsm;
cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->dsm.nr = cpu_to_le32(segments - 1);
@@ -583,6 +658,22 @@
return BLK_STS_OK;
}
+static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
+ struct request *req, struct nvme_command *cmnd)
+{
+ if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
+ return nvme_setup_discard(ns, req, cmnd);
+
+ cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
+ cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
+ cmnd->write_zeroes.slba =
+ cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+ cmnd->write_zeroes.length =
+ cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+ cmnd->write_zeroes.control = 0;
+ return BLK_STS_OK;
+}
+
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
struct request *req, struct nvme_command *cmnd)
{
@@ -598,7 +689,6 @@
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
- memset(cmnd, 0, sizeof(*cmnd));
cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
@@ -618,8 +708,6 @@
if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
return BLK_STS_NOTSUPP;
control |= NVME_RW_PRINFO_PRACT;
- } else if (req_op(req) == REQ_OP_WRITE) {
- t10_pi_prepare(req, ns->pi_type);
}
switch (ns->pi_type) {
@@ -642,16 +730,14 @@
void nvme_cleanup_cmd(struct request *req)
{
- if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
- nvme_req(req)->status == 0) {
- struct nvme_ns *ns = req->rq_disk->private_data;
-
- t10_pi_complete(req, ns->pi_type,
- blk_rq_bytes(req) >> ns->lba_shift);
- }
if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
- kfree(page_address(req->special_vec.bv_page) +
- req->special_vec.bv_offset);
+ struct nvme_ns *ns = req->rq_disk->private_data;
+ struct page *page = req->special_vec.bv_page;
+
+ if (page == ns->ctrl->discard_page)
+ clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+ else
+ kfree(page_address(page) + req->special_vec.bv_offset);
}
}
EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
@@ -663,6 +749,7 @@
nvme_clear_nvme_request(req);
+ memset(cmd, 0, sizeof(*cmd));
switch (req_op(req)) {
case REQ_OP_DRV_IN:
case REQ_OP_DRV_OUT:
@@ -672,7 +759,8 @@
nvme_setup_flush(ns, cmd);
break;
case REQ_OP_WRITE_ZEROES:
- /* currently only aliased to deallocate for a few ctrls: */
+ ret = nvme_setup_write_zeroes(ns, req, cmd);
+ break;
case REQ_OP_DISCARD:
ret = nvme_setup_discard(ns, req, cmd);
break;
@@ -691,6 +779,31 @@
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);
+static void nvme_end_sync_rq(struct request *rq, blk_status_t error)
+{
+ struct completion *waiting = rq->end_io_data;
+
+ rq->end_io_data = NULL;
+ complete(waiting);
+}
+
+static void nvme_execute_rq_polled(struct request_queue *q,
+ struct gendisk *bd_disk, struct request *rq, int at_head)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ WARN_ON_ONCE(!test_bit(QUEUE_FLAG_POLL, &q->queue_flags));
+
+ rq->cmd_flags |= REQ_HIPRI;
+ rq->end_io_data = &wait;
+ blk_execute_rq_nowait(q, bd_disk, rq, at_head, nvme_end_sync_rq);
+
+ while (!completion_done(&wait)) {
+ blk_poll(q, request_to_qc_t(rq->mq_hctx, rq), true);
+ cond_resched();
+ }
+}
+
/*
* Returns 0 on success. If the result is negative, it's a Linux error code;
* if the result is positive, it's an NVM Express status code
@@ -698,7 +811,7 @@
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
unsigned timeout, int qid, int at_head,
- blk_mq_req_flags_t flags)
+ blk_mq_req_flags_t flags, bool poll)
{
struct request *req;
int ret;
@@ -715,7 +828,10 @@
goto out;
}
- blk_execute_rq(req->q, NULL, req, at_head);
+ if (poll)
+ nvme_execute_rq_polled(req->q, NULL, req, at_head);
+ else
+ blk_execute_rq(req->q, NULL, req, at_head);
if (result)
*result = nvme_req(req)->result;
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
@@ -732,7 +848,7 @@
void *buffer, unsigned bufflen)
{
return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0,
- NVME_QID_ANY, 0, 0);
+ NVME_QID_ANY, 0, 0, false);
}
EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd);
@@ -773,7 +889,7 @@
static int nvme_submit_user_cmd(struct request_queue *q,
struct nvme_command *cmd, void __user *ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
- u32 meta_seed, u32 *result, unsigned timeout)
+ u32 meta_seed, u64 *result, unsigned timeout)
{
bool write = nvme_is_write(cmd);
struct nvme_ns *ns = q->queuedata;
@@ -814,7 +930,7 @@
else
ret = nvme_req(req)->status;
if (result)
- *result = le32_to_cpu(nvme_req(req)->result.u32);
+ *result = le64_to_cpu(nvme_req(req)->result.u64);
if (meta && !ret && !write) {
if (copy_to_user(meta_buffer, meta, meta_len))
ret = -EFAULT;
@@ -843,6 +959,7 @@
return;
}
+ ctrl->comp_seen = false;
spin_lock_irqsave(&ctrl->lock, flags);
if (ctrl->state == NVME_CTRL_LIVE ||
ctrl->state == NVME_CTRL_CONNECTING)
@@ -873,6 +990,15 @@
{
struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
struct nvme_ctrl, ka_work);
+ bool comp_seen = ctrl->comp_seen;
+
+ if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
+ dev_dbg(ctrl->device,
+ "reschedule traffic based keep-alive timer\n");
+ ctrl->comp_seen = false;
+ schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+ return;
+ }
if (nvme_keep_alive(ctrl)) {
/* allocation failure, reset the controller */
@@ -979,7 +1105,7 @@
uuid_copy(&ids->uuid, data + pos + sizeof(*cur));
break;
default:
- /* Skip unnkown types */
+ /* Skip unknown types */
len = cur->nidl;
break;
}
@@ -1002,10 +1128,9 @@
NVME_IDENTIFY_DATA_SIZE);
}
-static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
- unsigned nsid)
+static int nvme_identify_ns(struct nvme_ctrl *ctrl,
+ unsigned nsid, struct nvme_id_ns **id)
{
- struct nvme_id_ns *id;
struct nvme_command c = { };
int error;
@@ -1014,39 +1139,56 @@
c.identify.nsid = cpu_to_le32(nsid);
c.identify.cns = NVME_ID_CNS_NS;
- id = kmalloc(sizeof(*id), GFP_KERNEL);
- if (!id)
- return NULL;
+ *id = kmalloc(sizeof(**id), GFP_KERNEL);
+ if (!*id)
+ return -ENOMEM;
- error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id));
+ error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id));
if (error) {
- dev_warn(ctrl->device, "Identify namespace failed\n");
- kfree(id);
- return NULL;
+ dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
+ kfree(*id);
}
- return id;
+ return error;
}
-static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
- void *buffer, size_t buflen, u32 *result)
+static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
+ unsigned int dword11, void *buffer, size_t buflen, u32 *result)
{
struct nvme_command c;
union nvme_result res;
int ret;
memset(&c, 0, sizeof(c));
- c.features.opcode = nvme_admin_set_features;
+ c.features.opcode = op;
c.features.fid = cpu_to_le32(fid);
c.features.dword11 = cpu_to_le32(dword11);
ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
- buffer, buflen, 0, NVME_QID_ANY, 0, 0);
+ buffer, buflen, 0, NVME_QID_ANY, 0, 0, false);
if (ret >= 0 && result)
*result = le32_to_cpu(res.u32);
return ret;
}
+int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
+ unsigned int dword11, void *buffer, size_t buflen,
+ u32 *result)
+{
+ return nvme_features(dev, nvme_admin_set_features, fid, dword11, buffer,
+ buflen, result);
+}
+EXPORT_SYMBOL_GPL(nvme_set_features);
+
+int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
+ unsigned int dword11, void *buffer, size_t buflen,
+ u32 *result)
+{
+ return nvme_features(dev, nvme_admin_get_features, fid, dword11, buffer,
+ buflen, result);
+}
+EXPORT_SYMBOL_GPL(nvme_get_features);
+
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
{
u32 q_count = (*count - 1) | ((*count - 1) << 16);
@@ -1076,7 +1218,8 @@
EXPORT_SYMBOL_GPL(nvme_set_queue_count);
#define NVME_AEN_SUPPORTED \
- (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE)
+ (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \
+ NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE)
static void nvme_enable_aen(struct nvme_ctrl *ctrl)
{
@@ -1091,6 +1234,8 @@
if (status)
dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
supported_aens);
+
+ queue_work(nvme_wq, &ctrl->async_event_work);
}
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
@@ -1140,7 +1285,7 @@
return nvme_submit_user_cmd(ns->queue, &c,
(void __user *)(uintptr_t)io.addr, length,
- metadata, meta_len, io.slba, NULL, 0);
+ metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
}
static u32 nvme_known_admin_effects(u8 opcode)
@@ -1165,7 +1310,7 @@
if (ns) {
if (ctrl->effects)
effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
- if (effects & ~NVME_CMD_EFFECTS_CSUPP)
+ if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
dev_warn(ctrl->device,
"IO command:%02x has unhandled effects:%08x\n",
opcode, effects);
@@ -1174,14 +1319,17 @@
if (ctrl->effects)
effects = le32_to_cpu(ctrl->effects->acs[opcode]);
- else
- effects = nvme_known_admin_effects(opcode);
+ effects |= nvme_known_admin_effects(opcode);
/*
* For simplicity, IO to all namespaces is quiesced even if the command
* effects say only one namespace is affected.
*/
if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+ mutex_lock(&ctrl->scan_lock);
+ mutex_lock(&ctrl->subsys->lock);
+ nvme_mpath_start_freeze(ctrl->subsys);
+ nvme_mpath_wait_freeze(ctrl->subsys);
nvme_start_freeze(ctrl);
nvme_wait_freeze(ctrl);
}
@@ -1197,8 +1345,6 @@
if (ns->disk && nvme_revalidate_disk(ns->disk))
nvme_set_queue_dying(ns);
up_read(&ctrl->namespaces_rwsem);
-
- nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
}
static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
@@ -1210,8 +1356,13 @@
*/
if (effects & NVME_CMD_EFFECTS_LBCC)
nvme_update_formats(ctrl);
- if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK))
+ if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
nvme_unfreeze(ctrl);
+ nvme_mpath_unfreeze(ctrl->subsys);
+ mutex_unlock(&ctrl->subsys->lock);
+ nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
+ mutex_unlock(&ctrl->scan_lock);
+ }
if (effects & NVME_CMD_EFFECTS_CCC)
nvme_init_identify(ctrl);
if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
@@ -1225,6 +1376,54 @@
struct nvme_command c;
unsigned timeout = 0;
u32 effects;
+ u64 result;
+ int status;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
+ return -EFAULT;
+ if (cmd.flags)
+ return -EINVAL;
+
+ memset(&c, 0, sizeof(c));
+ c.common.opcode = cmd.opcode;
+ c.common.flags = cmd.flags;
+ c.common.nsid = cpu_to_le32(cmd.nsid);
+ c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+ c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+ c.common.cdw10 = cpu_to_le32(cmd.cdw10);
+ c.common.cdw11 = cpu_to_le32(cmd.cdw11);
+ c.common.cdw12 = cpu_to_le32(cmd.cdw12);
+ c.common.cdw13 = cpu_to_le32(cmd.cdw13);
+ c.common.cdw14 = cpu_to_le32(cmd.cdw14);
+ c.common.cdw15 = cpu_to_le32(cmd.cdw15);
+
+ if (cmd.timeout_ms)
+ timeout = msecs_to_jiffies(cmd.timeout_ms);
+
+ effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
+ status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
+ (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
+ (void __user *)(uintptr_t)cmd.metadata,
+ cmd.metadata_len, 0, &result, timeout);
+ nvme_passthru_end(ctrl, effects);
+
+ if (status >= 0) {
+ if (put_user(result, &ucmd->result))
+ return -EFAULT;
+ }
+
+ return status;
+}
+
+static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+ struct nvme_passthru_cmd64 __user *ucmd)
+{
+ struct nvme_passthru_cmd64 cmd;
+ struct nvme_command c;
+ unsigned timeout = 0;
+ u32 effects;
int status;
if (!capable(CAP_SYS_ADMIN))
@@ -1240,12 +1439,12 @@
c.common.nsid = cpu_to_le32(cmd.nsid);
c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
- c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
- c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
- c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
- c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
- c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
- c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
+ c.common.cdw10 = cpu_to_le32(cmd.cdw10);
+ c.common.cdw11 = cpu_to_le32(cmd.cdw11);
+ c.common.cdw12 = cpu_to_le32(cmd.cdw12);
+ c.common.cdw13 = cpu_to_le32(cmd.cdw13);
+ c.common.cdw14 = cpu_to_le32(cmd.cdw14);
+ c.common.cdw15 = cpu_to_le32(cmd.cdw15);
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
@@ -1274,9 +1473,14 @@
{
#ifdef CONFIG_NVME_MULTIPATH
if (disk->fops == &nvme_ns_head_ops) {
+ struct nvme_ns *ns;
+
*head = disk->private_data;
*srcu_idx = srcu_read_lock(&(*head)->srcu);
- return nvme_find_path(*head);
+ ns = nvme_find_path(*head);
+ if (!ns)
+ srcu_read_unlock(&(*head)->srcu, *srcu_idx);
+ return ns;
}
#endif
*head = NULL;
@@ -1290,42 +1494,82 @@
srcu_read_unlock(&head->srcu, idx);
}
-static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg)
+static bool is_ctrl_ioctl(unsigned int cmd)
{
+ if (cmd == NVME_IOCTL_ADMIN_CMD || cmd == NVME_IOCTL_ADMIN64_CMD)
+ return true;
+ if (is_sed_ioctl(cmd))
+ return true;
+ return false;
+}
+
+static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
+ void __user *argp,
+ struct nvme_ns_head *head,
+ int srcu_idx)
+{
+ struct nvme_ctrl *ctrl = ns->ctrl;
+ int ret;
+
+ nvme_get_ctrl(ns->ctrl);
+ nvme_put_ns_from_disk(head, srcu_idx);
+
switch (cmd) {
- case NVME_IOCTL_ID:
- force_successful_syscall_return();
- return ns->head->ns_id;
case NVME_IOCTL_ADMIN_CMD:
- return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
- case NVME_IOCTL_IO_CMD:
- return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
- case NVME_IOCTL_SUBMIT_IO:
- return nvme_submit_io(ns, (void __user *)arg);
+ ret = nvme_user_cmd(ctrl, NULL, argp);
+ break;
+ case NVME_IOCTL_ADMIN64_CMD:
+ ret = nvme_user_cmd64(ctrl, NULL, argp);
+ break;
default:
-#ifdef CONFIG_NVM
- if (ns->ndev)
- return nvme_nvm_ioctl(ns, cmd, arg);
-#endif
- if (is_sed_ioctl(cmd))
- return sed_ioctl(ns->ctrl->opal_dev, cmd,
- (void __user *) arg);
- return -ENOTTY;
+ ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
+ break;
}
+ nvme_put_ctrl(ctrl);
+ return ret;
}
static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct nvme_ns_head *head = NULL;
+ void __user *argp = (void __user *)arg;
struct nvme_ns *ns;
int srcu_idx, ret;
ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
if (unlikely(!ns))
- ret = -EWOULDBLOCK;
- else
- ret = nvme_ns_ioctl(ns, cmd, arg);
+ return -EWOULDBLOCK;
+
+ /*
+ * Handle ioctls that apply to the controller instead of the namespace
+ * separately and drop the ns SRCU reference early. This avoids a
+ * deadlock when deleting namespaces using the passthrough interface.
+ */
+ if (is_ctrl_ioctl(cmd))
+ return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
+
+ switch (cmd) {
+ case NVME_IOCTL_ID:
+ force_successful_syscall_return();
+ ret = ns->head->ns_id;
+ break;
+ case NVME_IOCTL_IO_CMD:
+ ret = nvme_user_cmd(ns->ctrl, ns, argp);
+ break;
+ case NVME_IOCTL_SUBMIT_IO:
+ ret = nvme_submit_io(ns, argp);
+ break;
+ case NVME_IOCTL_IO64_CMD:
+ ret = nvme_user_cmd64(ns->ctrl, ns, argp);
+ break;
+ default:
+ if (ns->ndev)
+ ret = nvme_nvm_ioctl(ns, cmd, arg);
+ else
+ ret = -ENOTTY;
+ }
+
nvme_put_ns_from_disk(head, srcu_idx);
return ret;
}
@@ -1407,10 +1651,10 @@
blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
}
-static void nvme_config_discard(struct nvme_ns *ns)
+static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
{
struct nvme_ctrl *ctrl = ns->ctrl;
- struct request_queue *queue = ns->queue;
+ struct request_queue *queue = disk->queue;
u32 size = queue_logical_block_size(queue);
if (!(ctrl->oncs & NVME_CTRL_ONCS_DSM)) {
@@ -1438,9 +1682,37 @@
blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
}
-static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
+static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
+{
+ u32 max_sectors;
+ unsigned short bs = 1 << ns->lba_shift;
+
+ if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) ||
+ (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
+ return;
+ /*
+ * Even though the NVMe spec explicitly states that MDTS is not
+ * applicable to the write-zeroes command ("The restriction does not
+ * apply to commands that do not transfer data between the host and
+ * the controller (e.g., Write Uncorrectable or Write Zeroes command)"),
+ * be more cautious and use the controller's max_hw_sectors value to
+ * configure the maximum sectors for write-zeroes. That value is
+ * derived from the controller's MDTS field in nvme_init_identify()
+ * if available.
+ */
+ if (ns->ctrl->max_hw_sectors == UINT_MAX)
+ max_sectors = ((u32)(USHRT_MAX + 1) * bs) >> 9;
+ else
+ max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9;
+
+ blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors);
+}
+
+static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
struct nvme_id_ns *id, struct nvme_ns_ids *ids)
{
+ int ret = 0;
+
memset(ids, 0, sizeof(*ids));
if (ctrl->vs >= NVME_VS(1, 1, 0))
@@ -1451,10 +1723,12 @@
/* Don't treat error as fatal we potentially
* already have a NGUID or EUI-64
*/
- if (nvme_identify_ns_descs(ctrl, nsid, ids))
+ ret = nvme_identify_ns_descs(ctrl, nsid, ids);
+ if (ret)
dev_warn(ctrl->device,
- "%s: Identify Descriptors failed\n", __func__);
+ "Identify Descriptors failed (%d)\n", ret);
}
+ return ret;
}
static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
@@ -1474,24 +1748,60 @@
static void nvme_update_disk_info(struct gendisk *disk,
struct nvme_ns *ns, struct nvme_id_ns *id)
{
- sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
+ sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9);
unsigned short bs = 1 << ns->lba_shift;
+ u32 atomic_bs, phys_bs, io_opt;
+ if (ns->lba_shift > PAGE_SHIFT) {
+ /* unsupported block size, set capacity to 0 later */
+ bs = (1 << 9);
+ }
blk_mq_freeze_queue(disk->queue);
blk_integrity_unregister(disk);
+ if (id->nabo == 0) {
+ /*
+ * Bit 1 indicates whether NAWUPF is defined for this namespace
+ * and whether it should be used instead of AWUPF. If NAWUPF ==
+ * 0 then AWUPF must be used instead.
+ */
+ if (id->nsfeat & (1 << 1) && id->nawupf)
+ atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
+ else
+ atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
+ } else {
+ atomic_bs = bs;
+ }
+ phys_bs = bs;
+ io_opt = bs;
+ if (id->nsfeat & (1 << 4)) {
+ /* NPWG = Namespace Preferred Write Granularity */
+ phys_bs *= 1 + le16_to_cpu(id->npwg);
+ /* NOWS = Namespace Optimal Write Size */
+ io_opt *= 1 + le16_to_cpu(id->nows);
+ }
+
blk_queue_logical_block_size(disk->queue, bs);
- blk_queue_physical_block_size(disk->queue, bs);
- blk_queue_io_min(disk->queue, bs);
+ /*
+ * Linux filesystems assume writing a single physical block is
+ * an atomic operation. Hence limit the physical block size to the
+ * value of the Atomic Write Unit Power Fail parameter.
+ */
+ blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
+ blk_queue_io_min(disk->queue, phys_bs);
+ blk_queue_io_opt(disk->queue, io_opt);
if (ns->ms && !ns->ext &&
(ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
nvme_init_integrity(disk, ns->ms, ns->pi_type);
- if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk))
+ if ((ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) ||
+ ns->lba_shift > PAGE_SHIFT)
capacity = 0;
set_capacity(disk, capacity);
- nvme_config_discard(ns);
+
+ nvme_config_discard(disk, ns);
+ nvme_config_write_zeroes(disk, ns);
if (id->nsattr & (1 << 0))
set_disk_ro(disk, true);
@@ -1524,12 +1834,11 @@
if (ns->noiob)
nvme_set_chunk_size(ns);
nvme_update_disk_info(disk, ns, id);
- if (ns->ndev)
- nvme_nvm_update_nvm_info(ns);
#ifdef CONFIG_NVME_MULTIPATH
if (ns->head->disk) {
nvme_update_disk_info(ns->head->disk, ns, id);
blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
+ revalidate_disk(ns->head->disk);
}
#endif
}
@@ -1547,25 +1856,37 @@
return -ENODEV;
}
- id = nvme_identify_ns(ctrl, ns->head->ns_id);
- if (!id)
- return -ENODEV;
+ ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id);
+ if (ret)
+ goto out;
if (id->ncap == 0) {
ret = -ENODEV;
- goto out;
+ goto free_id;
}
__nvme_revalidate_disk(disk, id);
- nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
+ ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
+ if (ret)
+ goto free_id;
+
if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
dev_err(ctrl->device,
"identifiers changed for nsid %d\n", ns->head->ns_id);
ret = -ENODEV;
}
-out:
+free_id:
kfree(id);
+out:
+ /*
+ * Only fail the function if we got a fatal error back from the
+ * device, otherwise ignore the error and just move on.
+ */
+ if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR)))
+ ret = 0;
+ else if (ret > 0)
+ ret = blk_status_to_errno(nvme_error_status(ret));
return ret;
}
@@ -1608,7 +1929,7 @@
memset(&c, 0, sizeof(c));
c.common.opcode = op;
c.common.nsid = cpu_to_le32(ns->head->ns_id);
- c.common.cdw10[0] = cpu_to_le32(cdw10);
+ c.common.cdw10 = cpu_to_le32(cdw10);
ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
nvme_put_ns_from_disk(head, srcu_idx);
@@ -1682,11 +2003,11 @@
else
cmd.common.opcode = nvme_admin_security_recv;
cmd.common.nsid = 0;
- cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
- cmd.common.cdw10[1] = cpu_to_le32(len);
+ cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
+ cmd.common.cdw11 = cpu_to_le32(len);
return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
- ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
+ ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0, false);
}
EXPORT_SYMBOL_GPL(nvme_sec_submit);
#endif /* CONFIG_BLK_SED_OPAL */
@@ -1761,7 +2082,7 @@
* bits', but doing so may cause the device to complete commands to the
* admin queue ... and we don't know what memory that might be pointing at!
*/
-int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
{
int ret;
@@ -1775,20 +2096,27 @@
if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
msleep(NVME_QUIRK_DELAY_AMOUNT);
- return nvme_wait_ready(ctrl, cap, false);
+ return nvme_wait_ready(ctrl, ctrl->cap, false);
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
-int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap)
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
{
/*
* Default to a 4K page size, with the intention to update this
* path in the future to accomodate architectures with differing
* kernel and IO page sizes.
*/
- unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12;
+ unsigned dev_page_min, page_shift = 12;
int ret;
+ ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
+ if (ret) {
+ dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
+ return ret;
+ }
+ dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+
if (page_shift < dev_page_min) {
dev_err(ctrl->device,
"Minimum device page size %u too large for host (%u)\n",
@@ -1807,7 +2135,7 @@
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
- return nvme_wait_ready(ctrl, cap, true);
+ return nvme_wait_ready(ctrl, ctrl->cap, true);
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
@@ -1881,6 +2209,26 @@
return ret;
}
+static int nvme_configure_acre(struct nvme_ctrl *ctrl)
+{
+ struct nvme_feat_host_behavior *host;
+ int ret;
+
+ /* Don't bother enabling the feature if retry delay is not reported */
+ if (!ctrl->crdt[0])
+ return 0;
+
+ host = kzalloc(sizeof(*host), GFP_KERNEL);
+ if (!host)
+ return 0;
+
+ host->acre = NVME_ENABLE_ACRE;
+ ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
+ host, sizeof(*host), NULL);
+ kfree(host);
+ return ret;
+}
+
static int nvme_configure_apst(struct nvme_ctrl *ctrl)
{
/*
@@ -2046,6 +2394,26 @@
.vid = 0x1179,
.mn = "THNSF5256GPUK TOSHIBA",
.quirks = NVME_QUIRK_NO_APST,
+ },
+ {
+ /*
+ * This LiteON CL1-3D*-Q11 firmware version has a race
+ * condition associated with actions related to suspend to idle.
+ * LiteON has resolved the problem in future firmware.
+ */
+ .vid = 0x14a4,
+ .fr = "22301111",
+ .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
+ },
+ {
+ /*
+ * This Kingston E8FK11.T firmware version has no interrupt
+ * after resume with actions related to suspend to idle.
+ * See https://bugzilla.kernel.org/show_bug.cgi?id=204887
+ */
+ .vid = 0x2646,
+ .fr = "E8FK11.T",
+ .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
}
};
@@ -2084,18 +2452,20 @@
size_t nqnlen;
int off;
- nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
- if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
- strncpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
- return;
- }
+ if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
+ nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
+ if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
+ strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
+ return;
+ }
- if (ctrl->vs >= NVME_VS(1, 2, 1))
- dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
+ if (ctrl->vs >= NVME_VS(1, 2, 1))
+ dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
+ }
/* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */
off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
- "nqn.2014.08.org.nvmexpress:%4x%4x",
+ "nqn.2014.08.org.nvmexpress:%04x%04x",
le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn));
off += sizeof(id->sn);
@@ -2104,15 +2474,14 @@
memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off);
}
-static void __nvme_release_subsystem(struct nvme_subsystem *subsys)
-{
- ida_simple_remove(&nvme_subsystems_ida, subsys->instance);
- kfree(subsys);
-}
-
static void nvme_release_subsystem(struct device *dev)
{
- __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev));
+ struct nvme_subsystem *subsys =
+ container_of(dev, struct nvme_subsystem, dev);
+
+ if (subsys->instance >= 0)
+ ida_simple_remove(&nvme_instance_ida, subsys->instance);
+ kfree(subsys);
}
static void nvme_destroy_subsystem(struct kref *ref)
@@ -2140,6 +2509,17 @@
lockdep_assert_held(&nvme_subsystems_lock);
+ /*
+ * Fail matches for discovery subsystems. This results in each
+ * discovery controller being bound to a unique subsystem.
+ * This avoids issues with validating controller values
+ * that can only be true when there is a single unique subsystem.
+ * There may be multiple and completely independent entities
+ * that provide discovery controllers.
+ */
+ if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME))
+ return NULL;
+
list_for_each_entry(subsys, &nvme_subsystems, entry) {
if (strcmp(subsys->subnqn, subsysnqn))
continue;
@@ -2186,6 +2566,9 @@
&subsys_attr_serial.attr,
&subsys_attr_firmware_rev.attr,
&subsys_attr_subsysnqn.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+ &subsys_attr_iopolicy.attr,
+#endif
NULL,
};
@@ -2198,20 +2581,35 @@
NULL,
};
-static int nvme_active_ctrls(struct nvme_subsystem *subsys)
+static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
+ struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
- int count = 0;
- struct nvme_ctrl *ctrl;
+ struct nvme_ctrl *tmp;
- mutex_lock(&subsys->lock);
- list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
- if (ctrl->state != NVME_CTRL_DELETING &&
- ctrl->state != NVME_CTRL_DEAD)
- count++;
+ lockdep_assert_held(&nvme_subsystems_lock);
+
+ list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
+ if (tmp->state == NVME_CTRL_DELETING ||
+ tmp->state == NVME_CTRL_DEAD)
+ continue;
+
+ if (tmp->cntlid == ctrl->cntlid) {
+ dev_err(ctrl->device,
+ "Duplicate cntlid %u with %s, rejecting\n",
+ ctrl->cntlid, dev_name(tmp->device));
+ return false;
+ }
+
+ if ((id->cmic & (1 << 1)) ||
+ (ctrl->opts && ctrl->opts->discovery_nqn))
+ continue;
+
+ dev_err(ctrl->device,
+ "Subsystem does not support multiple controllers\n");
+ return false;
}
- mutex_unlock(&subsys->lock);
- return count;
+ return true;
}
static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
@@ -2222,12 +2620,8 @@
subsys = kzalloc(sizeof(*subsys), GFP_KERNEL);
if (!subsys)
return -ENOMEM;
- ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL);
- if (ret < 0) {
- kfree(subsys);
- return ret;
- }
- subsys->instance = ret;
+
+ subsys->instance = -1;
mutex_init(&subsys->lock);
kref_init(&subsys->ref);
INIT_LIST_HEAD(&subsys->ctrls);
@@ -2238,63 +2632,58 @@
memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
subsys->vendor_id = le16_to_cpu(id->vid);
subsys->cmic = id->cmic;
+ subsys->awupf = le16_to_cpu(id->awupf);
+#ifdef CONFIG_NVME_MULTIPATH
+ subsys->iopolicy = NVME_IOPOLICY_NUMA;
+#endif
subsys->dev.class = nvme_subsys_class;
subsys->dev.release = nvme_release_subsystem;
subsys->dev.groups = nvme_subsys_attrs_groups;
- dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance);
+ dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
device_initialize(&subsys->dev);
mutex_lock(&nvme_subsystems_lock);
found = __nvme_find_get_subsystem(subsys->subnqn);
if (found) {
- /*
- * Verify that the subsystem actually supports multiple
- * controllers, else bail out.
- */
- if (!(ctrl->opts && ctrl->opts->discovery_nqn) &&
- nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
- dev_err(ctrl->device,
- "ignoring ctrl due to duplicate subnqn (%s).\n",
- found->subnqn);
- nvme_put_subsystem(found);
- ret = -EINVAL;
- goto out_unlock;
- }
-
- __nvme_release_subsystem(subsys);
+ put_device(&subsys->dev);
subsys = found;
+
+ if (!nvme_validate_cntlid(subsys, ctrl, id)) {
+ ret = -EINVAL;
+ goto out_put_subsystem;
+ }
} else {
ret = device_add(&subsys->dev);
if (ret) {
dev_err(ctrl->device,
"failed to register subsystem device.\n");
+ put_device(&subsys->dev);
goto out_unlock;
}
ida_init(&subsys->ns_ida);
list_add_tail(&subsys->entry, &nvme_subsystems);
}
- ctrl->subsys = subsys;
- mutex_unlock(&nvme_subsystems_lock);
-
- if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
- dev_name(ctrl->device))) {
+ ret = sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj,
+ dev_name(ctrl->device));
+ if (ret) {
dev_err(ctrl->device,
"failed to create sysfs link from subsystem.\n");
- /* the transport driver will eventually put the subsystem */
- return -EINVAL;
+ goto out_put_subsystem;
}
- mutex_lock(&subsys->lock);
+ if (!found)
+ subsys->instance = ctrl->instance;
+ ctrl->subsys = subsys;
list_add_tail(&ctrl->subsys_entry, &subsys->ctrls);
- mutex_unlock(&subsys->lock);
-
+ mutex_unlock(&nvme_subsystems_lock);
return 0;
+out_put_subsystem:
+ nvme_put_subsystem(subsys);
out_unlock:
mutex_unlock(&nvme_subsystems_lock);
- put_device(&subsys->dev);
return ret;
}
@@ -2343,7 +2732,6 @@
int nvme_init_identify(struct nvme_ctrl *ctrl)
{
struct nvme_id_ctrl *id;
- u64 cap;
int ret, page_shift;
u32 max_hw_sectors;
bool prev_apst_enabled;
@@ -2353,16 +2741,11 @@
dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
return ret;
}
-
- ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
- if (ret) {
- dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
- return ret;
- }
- page_shift = NVME_CAP_MPSMIN(cap) + 12;
+ page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
+ ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
if (ctrl->vs >= NVME_VS(1, 1, 0))
- ctrl->subsystem = NVME_CAP_NSSRC(cap);
+ ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
ret = nvme_identify_ctrl(ctrl, &id);
if (ret) {
@@ -2376,6 +2759,9 @@
goto out_free;
}
+ if (!(ctrl->ops->flags & NVME_F_FABRICS))
+ ctrl->cntlid = le16_to_cpu(id->cntlid);
+
if (!ctrl->identified) {
int i;
@@ -2402,12 +2788,16 @@
ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
}
+ ctrl->crdt[0] = le16_to_cpu(id->crdt1);
+ ctrl->crdt[1] = le16_to_cpu(id->crdt2);
+ ctrl->crdt[2] = le16_to_cpu(id->crdt3);
+
ctrl->oacs = le16_to_cpu(id->oacs);
- ctrl->oncs = le16_to_cpup(&id->oncs);
+ ctrl->oncs = le16_to_cpu(id->oncs);
+ ctrl->mtfa = le16_to_cpu(id->mtfa);
ctrl->oaes = le32_to_cpu(id->oaes);
atomic_set(&ctrl->abort_limit, id->acl + 1);
ctrl->vwc = id->vwc;
- ctrl->cntlid = le16_to_cpup(&id->cntlid);
if (id->mdts)
max_hw_sectors = 1 << (id->mdts + page_shift - 9);
else
@@ -2419,6 +2809,7 @@
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
ctrl->max_namespaces = le32_to_cpu(id->mnan);
+ ctrl->ctratt = le32_to_cpu(id->ctratt);
if (id->rtd3e) {
/* us -> s */
@@ -2471,7 +2862,6 @@
goto out_free;
}
} else {
- ctrl->cntlid = le16_to_cpu(id->cntlid);
ctrl->hmpre = le32_to_cpu(id->hmpre);
ctrl->hmmin = le32_to_cpu(id->hmmin);
ctrl->hmminds = le32_to_cpu(id->hmminds);
@@ -2501,6 +2891,10 @@
if (ret < 0)
return ret;
+ ret = nvme_configure_acre(ctrl);
+ if (ret < 0)
+ return ret;
+
ctrl->identified = true;
return 0;
@@ -2518,7 +2912,6 @@
switch (ctrl->state) {
case NVME_CTRL_LIVE:
- case NVME_CTRL_ADMIN_ONLY:
break;
default:
return -EWOULDBLOCK;
@@ -2570,6 +2963,8 @@
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
return nvme_user_cmd(ctrl, NULL, argp);
+ case NVME_IOCTL_ADMIN64_CMD:
+ return nvme_user_cmd64(ctrl, NULL, argp);
case NVME_IOCTL_IO_CMD:
return nvme_dev_user_cmd(ctrl, argp);
case NVME_IOCTL_RESET:
@@ -2739,11 +3134,19 @@
return a->mode;
}
-const struct attribute_group nvme_ns_id_attr_group = {
+static const struct attribute_group nvme_ns_id_attr_group = {
.attrs = nvme_ns_id_attrs,
.is_visible = nvme_ns_id_attrs_are_visible,
};
+const struct attribute_group *nvme_ns_id_attr_groups[] = {
+ &nvme_ns_id_attr_group,
+#ifdef CONFIG_NVM
+ &nvme_nvm_attr_group,
+#endif
+ NULL,
+};
+
#define nvme_show_str_function(field) \
static ssize_t field##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
@@ -2768,6 +3171,9 @@
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
nvme_show_int_function(cntlid);
+nvme_show_int_function(numa_node);
+nvme_show_int_function(queue_count);
+nvme_show_int_function(sqsize);
static ssize_t nvme_sysfs_delete(struct device *dev,
struct device_attribute *attr, const char *buf,
@@ -2799,7 +3205,6 @@
static const char *const state_name[] = {
[NVME_CTRL_NEW] = "new",
[NVME_CTRL_LIVE] = "live",
- [NVME_CTRL_ADMIN_ONLY] = "only-admin",
[NVME_CTRL_RESETTING] = "resetting",
[NVME_CTRL_CONNECTING] = "connecting",
[NVME_CTRL_DELETING] = "deleting",
@@ -2847,6 +3252,9 @@
&dev_attr_subsysnqn.attr,
&dev_attr_address.attr,
&dev_attr_state.attr,
+ &dev_attr_numa_node.attr,
+ &dev_attr_queue_count.attr,
+ &dev_attr_sqsize.attr,
NULL
};
@@ -2910,9 +3318,14 @@
unsigned nsid, struct nvme_id_ns *id)
{
struct nvme_ns_head *head;
+ size_t size = sizeof(*head);
int ret = -ENOMEM;
- head = kzalloc(sizeof(*head), GFP_KERNEL);
+#ifdef CONFIG_NVME_MULTIPATH
+ size += num_possible_nodes() * sizeof(struct nvme_ns *);
+#endif
+
+ head = kzalloc(size, GFP_KERNEL);
if (!head)
goto out;
ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL);
@@ -2927,7 +3340,9 @@
head->ns_id = nsid;
kref_init(&head->ref);
- nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
+ ret = nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
+ if (ret)
+ goto out_cleanup_srcu;
ret = __nvme_check_ids(ctrl->subsys, head);
if (ret) {
@@ -2952,6 +3367,8 @@
out_free_head:
kfree(head);
out:
+ if (ret > 0)
+ ret = blk_status_to_errno(nvme_error_status(ret));
return ERR_PTR(ret);
}
@@ -2975,7 +3392,10 @@
} else {
struct nvme_ns_ids ids;
- nvme_report_ns_ids(ctrl, nsid, id, &ids);
+ ret = nvme_report_ns_ids(ctrl, nsid, id, &ids);
+ if (ret)
+ goto out_unlock;
+
if (!nvme_ns_ids_equal(&head->ids, &ids)) {
dev_err(ctrl->device,
"IDs don't match for shared namespace %d\n",
@@ -2990,6 +3410,8 @@
out_unlock:
mutex_unlock(&ctrl->subsys->lock);
+ if (ret > 0)
+ ret = blk_status_to_errno(nvme_error_status(ret));
return ret;
}
@@ -3046,22 +3468,32 @@
return 0;
}
-static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
{
struct nvme_ns *ns;
struct gendisk *disk;
struct nvme_id_ns *id;
char disk_name[DISK_NAME_LEN];
- int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
+ int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT, ret;
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
- return;
+ return -ENOMEM;
ns->queue = blk_mq_init_queue(ctrl->tagset);
- if (IS_ERR(ns->queue))
+ if (IS_ERR(ns->queue)) {
+ ret = PTR_ERR(ns->queue);
goto out_free_ns;
+ }
+
+ if (ctrl->opts && ctrl->opts->data_digest)
+ ns->queue->backing_dev_info->capabilities
+ |= BDI_CAP_STABLE_WRITES;
+
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
+ if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
+ blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
+
ns->queue->queuedata = ns;
ns->ctrl = ctrl;
@@ -3071,28 +3503,26 @@
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
nvme_set_queue_limits(ctrl, ns->queue);
- id = nvme_identify_ns(ctrl, nsid);
- if (!id)
+ ret = nvme_identify_ns(ctrl, nsid, &id);
+ if (ret)
goto out_free_queue;
- if (id->ncap == 0)
+ if (id->ncap == 0) {
+ ret = -EINVAL;
goto out_free_id;
+ }
- if (nvme_init_ns_head(ns, nsid, id))
+ ret = nvme_init_ns_head(ns, nsid, id);
+ if (ret)
goto out_free_id;
nvme_setup_streams_ns(ctrl, ns);
nvme_set_disk_name(disk_name, ns, ctrl, &flags);
- if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
- if (nvme_nvm_register(ns, disk_name, node)) {
- dev_warn(ctrl->device, "LightNVM init failure\n");
- goto out_unlink_ns;
- }
- }
-
disk = alloc_disk_node(0, node);
- if (!disk)
+ if (!disk) {
+ ret = -ENOMEM;
goto out_unlink_ns;
+ }
disk->fops = &nvme_fops;
disk->private_data = ns;
@@ -3103,36 +3533,43 @@
__nvme_revalidate_disk(disk, id);
+ if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
+ ret = nvme_nvm_register(ns, disk_name, node);
+ if (ret) {
+ dev_warn(ctrl->device, "LightNVM init failure\n");
+ goto out_put_disk;
+ }
+ }
+
down_write(&ctrl->namespaces_rwsem);
list_add_tail(&ns->list, &ctrl->namespaces);
up_write(&ctrl->namespaces_rwsem);
nvme_get_ctrl(ctrl);
- device_add_disk(ctrl->device, ns->disk);
- if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
- &nvme_ns_id_attr_group))
- pr_warn("%s: failed to create sysfs group for identification\n",
- ns->disk->disk_name);
- if (ns->ndev && nvme_nvm_register_sysfs(ns))
- pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
- ns->disk->disk_name);
+ device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups);
nvme_mpath_add_disk(ns, id);
- nvme_fault_inject_init(ns);
+ nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name);
kfree(id);
- return;
+ return 0;
+ out_put_disk:
+ put_disk(ns->disk);
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
mutex_unlock(&ctrl->subsys->lock);
+ nvme_put_ns_head(ns->head);
out_free_id:
kfree(id);
out_free_queue:
blk_cleanup_queue(ns->queue);
out_free_ns:
kfree(ns);
+ if (ret > 0)
+ ret = blk_status_to_errno(nvme_error_status(ret));
+ return ret;
}
static void nvme_ns_remove(struct nvme_ns *ns)
@@ -3140,28 +3577,26 @@
if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
return;
- nvme_fault_inject_fini(ns);
+ nvme_fault_inject_fini(&ns->fault_inject);
+
+ mutex_lock(&ns->ctrl->subsys->lock);
+ list_del_rcu(&ns->siblings);
+ mutex_unlock(&ns->ctrl->subsys->lock);
+ synchronize_rcu(); /* guarantee not available in head->list */
+ nvme_mpath_clear_current_path(ns);
+ synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
+
if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
- sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
- &nvme_ns_id_attr_group);
- if (ns->ndev)
- nvme_nvm_unregister_sysfs(ns);
del_gendisk(ns->disk);
blk_cleanup_queue(ns->queue);
if (blk_get_integrity(ns->disk))
blk_integrity_unregister(ns->disk);
}
- mutex_lock(&ns->ctrl->subsys->lock);
- list_del_rcu(&ns->siblings);
- nvme_mpath_clear_current_path(ns);
- mutex_unlock(&ns->ctrl->subsys->lock);
-
down_write(&ns->ctrl->namespaces_rwsem);
list_del_init(&ns->list);
up_write(&ns->ctrl->namespaces_rwsem);
- synchronize_srcu(&ns->head->srcu);
nvme_mpath_check_last_path(ns);
nvme_put_ns(ns);
}
@@ -3201,7 +3636,8 @@
{
struct nvme_ns *ns;
__le32 *ns_list;
- unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
+ unsigned i, j, nsid, prev = 0;
+ unsigned num_lists = DIV_ROUND_UP_ULL((u64)nn, 1024);
int ret = 0;
ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
@@ -3279,11 +3715,10 @@
struct nvme_id_ctrl *id;
unsigned nn;
- if (ctrl->state != NVME_CTRL_LIVE)
+ /* No tagset on a live ctrl means IO queues could not be created */
+ if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
return;
- WARN_ON_ONCE(!ctrl->tagset);
-
if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
dev_info(ctrl->device, "rescanning namespaces.\n");
nvme_clear_changed_ns_log(ctrl);
@@ -3292,6 +3727,7 @@
if (nvme_identify_ctrl(ctrl, &id))
return;
+ mutex_lock(&ctrl->scan_lock);
nn = le32_to_cpu(id->nn);
if (ctrl->vs >= NVME_VS(1, 1, 0) &&
!(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
@@ -3300,6 +3736,7 @@
}
nvme_scan_ns_sequential(ctrl, nn);
out_free_id:
+ mutex_unlock(&ctrl->scan_lock);
kfree(id);
down_write(&ctrl->namespaces_rwsem);
list_sort(NULL, &ctrl->namespaces, ns_cmp);
@@ -3316,6 +3753,13 @@
struct nvme_ns *ns, *next;
LIST_HEAD(ns_list);
+ /*
+ * Make sure to requeue I/O to all namespaces, as these
+ * might result from the scan itself and must complete
+ * for the scan_work to make progress.
+ */
+ nvme_mpath_clear_ctrl_paths(ctrl);
+
/* prevent racing with ns scanning */
flush_work(&ctrl->scan_work);
@@ -3337,6 +3781,33 @@
}
EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
+static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+ struct nvme_ctrl *ctrl =
+ container_of(dev, struct nvme_ctrl, ctrl_device);
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+ int ret;
+
+ ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name);
+ if (ret)
+ return ret;
+
+ if (opts) {
+ ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr);
+ if (ret)
+ return ret;
+
+ ret = add_uevent_var(env, "NVME_TRSVCID=%s",
+ opts->trsvcid ?: "none");
+ if (ret)
+ return ret;
+
+ ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s",
+ opts->host_traddr ?: "none");
+ }
+ return ret;
+}
+
static void nvme_aen_uevent(struct nvme_ctrl *ctrl)
{
char *envp[2] = { NULL, NULL };
@@ -3408,13 +3879,13 @@
if (time_after(jiffies, fw_act_timeout)) {
dev_warn(ctrl->device,
"Fw activation timeout, reset controller\n");
- nvme_reset_ctrl(ctrl);
- break;
+ nvme_try_sched_reset(ctrl);
+ return;
}
msleep(100);
}
- if (ctrl->state != NVME_CTRL_LIVE)
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
return;
nvme_start_queues(ctrl);
@@ -3424,13 +3895,23 @@
static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
{
- switch ((result & 0xff00) >> 8) {
+ u32 aer_notice_type = (result & 0xff00) >> 8;
+
+ trace_nvme_async_event(ctrl, aer_notice_type);
+
+ switch (aer_notice_type) {
case NVME_AER_NOTICE_NS_CHANGED:
set_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events);
nvme_queue_scan(ctrl);
break;
case NVME_AER_NOTICE_FW_ACT_STARTING:
- queue_work(nvme_wq, &ctrl->fw_act_work);
+ /*
+ * We are (ab)using the RESETTING state to prevent subsequent
+ * recovery actions from interfering with the controller's
+ * firmware activation.
+ */
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+ queue_work(nvme_wq, &ctrl->fw_act_work);
break;
#ifdef CONFIG_NVME_MULTIPATH
case NVME_AER_NOTICE_ANA:
@@ -3439,6 +3920,9 @@
queue_work(nvme_wq, &ctrl->ana_work);
break;
#endif
+ case NVME_AER_NOTICE_DISC_CHANGED:
+ ctrl->aen_result = result;
+ break;
default:
dev_warn(ctrl->device, "async event result %08x\n", result);
}
@@ -3448,11 +3932,12 @@
volatile union nvme_result *res)
{
u32 result = le32_to_cpu(res->u32);
+ u32 aer_type = result & 0x07;
if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS)
return;
- switch (result & 0x7) {
+ switch (aer_type) {
case NVME_AER_NOTICE:
nvme_handle_aen_notice(ctrl, result);
break;
@@ -3460,6 +3945,7 @@
case NVME_AER_SMART:
case NVME_AER_CSS:
case NVME_AER_VS:
+ trace_nvme_async_event(ctrl, aer_type);
ctrl->aen_result = result;
break;
default:
@@ -3475,8 +3961,6 @@
nvme_stop_keep_alive(ctrl);
flush_work(&ctrl->async_event_work);
cancel_work_sync(&ctrl->fw_act_work);
- if (ctrl->ops->stop_ctrl)
- ctrl->ops->stop_ctrl(ctrl);
}
EXPORT_SYMBOL_GPL(nvme_stop_ctrl);
@@ -3485,10 +3969,10 @@
if (ctrl->kato)
nvme_start_keep_alive(ctrl);
+ nvme_enable_aen(ctrl);
+
if (ctrl->queue_count > 1) {
nvme_queue_scan(ctrl);
- nvme_enable_aen(ctrl);
- queue_work(nvme_wq, &ctrl->async_event_work);
nvme_start_queues(ctrl);
}
}
@@ -3496,6 +3980,8 @@
void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
{
+ nvme_fault_inject_fini(&ctrl->fault_inject);
+ dev_pm_qos_hide_latency_tolerance(ctrl->device);
cdev_device_del(&ctrl->cdev, ctrl->device);
}
EXPORT_SYMBOL_GPL(nvme_uninit_ctrl);
@@ -3506,15 +3992,18 @@
container_of(dev, struct nvme_ctrl, ctrl_device);
struct nvme_subsystem *subsys = ctrl->subsys;
- ida_simple_remove(&nvme_instance_ida, ctrl->instance);
+ if (subsys && ctrl->instance != subsys->instance)
+ ida_simple_remove(&nvme_instance_ida, ctrl->instance);
+
kfree(ctrl->effects);
nvme_mpath_uninit(ctrl);
+ __free_page(ctrl->discard_page);
if (subsys) {
- mutex_lock(&subsys->lock);
+ mutex_lock(&nvme_subsystems_lock);
list_del(&ctrl->subsys_entry);
- mutex_unlock(&subsys->lock);
sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device));
+ mutex_unlock(&nvme_subsystems_lock);
}
ctrl->ops->free_ctrl(ctrl);
@@ -3535,6 +4024,7 @@
ctrl->state = NVME_CTRL_NEW;
spin_lock_init(&ctrl->lock);
+ mutex_init(&ctrl->scan_lock);
INIT_LIST_HEAD(&ctrl->namespaces);
init_rwsem(&ctrl->namespaces_rwsem);
ctrl->dev = dev;
@@ -3544,11 +4034,20 @@
INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
+ init_waitqueue_head(&ctrl->state_wq);
INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
+ BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
+ PAGE_SIZE);
+ ctrl->discard_page = alloc_page(GFP_KERNEL);
+ if (!ctrl->discard_page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
if (ret < 0)
goto out;
@@ -3580,12 +4079,16 @@
dev_pm_qos_update_user_latency_tolerance(ctrl->device,
min(default_ps_max_latency_us, (unsigned long)S32_MAX));
+ nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
+
return 0;
out_free_name:
- kfree_const(dev->kobj.name);
+ kfree_const(ctrl->device->kobj.name);
out_release_instance:
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
out:
+ if (ctrl->discard_page)
+ __free_page(ctrl->discard_page);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
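The BUILD_BUG_ON added to nvme_init_ctrl above encodes the assumption behind the single pre-allocated discard page: a worst-case DSM range list must fit in one page. With the values the 5.4 headers are commonly built with (256 ranges of 16 bytes each, 4 KiB pages) that is exactly 4096 bytes. A standalone sketch of the same check, constants assumed rather than taken from the headers:

#include <assert.h>

#define EXAMPLE_DSM_MAX_RANGES	256	/* assumed NVME_DSM_MAX_RANGES */
#define EXAMPLE_DSM_RANGE_SIZE	16	/* assumed sizeof(struct nvme_dsm_range) */
#define EXAMPLE_PAGE_SIZE	4096

static_assert(EXAMPLE_DSM_MAX_RANGES * EXAMPLE_DSM_RANGE_SIZE <= EXAMPLE_PAGE_SIZE,
	      "worst-case DSM payload must fit in the pre-allocated page");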
@@ -3604,7 +4107,7 @@
down_read(&ctrl->namespaces_rwsem);
/* Forcibly unquiesce queues to avoid blocking dispatch */
- if (ctrl->admin_q)
+ if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
blk_mq_unquiesce_queue(ctrl->admin_q);
list_for_each_entry(ns, &ctrl->namespaces, list)
@@ -3683,10 +4186,52 @@
}
EXPORT_SYMBOL_GPL(nvme_start_queues);
-int __init nvme_core_init(void)
+
+void nvme_sync_queues(struct nvme_ctrl *ctrl)
+{
+ struct nvme_ns *ns;
+
+ down_read(&ctrl->namespaces_rwsem);
+ list_for_each_entry(ns, &ctrl->namespaces, list)
+ blk_sync_queue(ns->queue);
+ up_read(&ctrl->namespaces_rwsem);
+
+ if (ctrl->admin_q)
+ blk_sync_queue(ctrl->admin_q);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_queues);
+
+/*
+ * Check we didn't inadvertently grow the command structure sizes:
+ */
+static inline void _nvme_check_size(void)
+{
+ BUILD_BUG_ON(sizeof(struct nvme_common_command) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_identify) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_download_firmware) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_dsm_cmd) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_write_zeroes_cmd) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_get_log_page_command) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
+ BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
+ BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+ BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
+}
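These checks live in a dummy inline function rather than at file scope because BUILD_BUG_ON expands to code and therefore needs function context; calling _nvme_check_size() once from nvme_core_init() below is enough for the compiler to evaluate every assertion at build time.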
+
+
+static int __init nvme_core_init(void)
{
int result = -ENOMEM;
+ _nvme_check_size();
+
nvme_wq = alloc_workqueue("nvme-wq",
WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
if (!nvme_wq)
@@ -3711,6 +4256,7 @@
result = PTR_ERR(nvme_class);
goto unregister_chrdev;
}
+ nvme_class->dev_uevent = nvme_class_uevent;
nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem");
if (IS_ERR(nvme_subsys_class)) {
@@ -3733,9 +4279,8 @@
return result;
}
-void nvme_core_exit(void)
+static void __exit nvme_core_exit(void)
{
- ida_destroy(&nvme_subsystems_ida);
class_destroy(nvme_subsys_class);
class_destroy(nvme_class);
unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index bcd09d3..74b8818 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* NVMe over Fabrics common host code.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
@@ -158,8 +150,8 @@
cmd.prop_get.fctype = nvme_fabrics_type_property_get;
cmd.prop_get.offset = cpu_to_le32(off);
- ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
- NVME_QID_ANY, 0, 0);
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
+ NVME_QID_ANY, 0, 0, false);
if (ret >= 0)
*val = le64_to_cpu(res.u64);
@@ -205,8 +197,8 @@
cmd.prop_get.attrib = 1;
cmd.prop_get.offset = cpu_to_le32(off);
- ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0,
- NVME_QID_ANY, 0, 0);
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
+ NVME_QID_ANY, 0, 0, false);
if (ret >= 0)
*val = le64_to_cpu(res.u64);
@@ -251,8 +243,8 @@
cmd.prop_set.offset = cpu_to_le32(off);
cmd.prop_set.value = cpu_to_le64(val);
- ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0,
- NVME_QID_ANY, 0, 0);
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0,
+ NVME_QID_ANY, 0, 0, false);
if (unlikely(ret))
dev_err(ctrl->device,
"Property Set error: %d, offset %#x\n",
@@ -389,8 +381,11 @@
* Set keep-alive timeout in seconds granularity (ms * 1000)
* and add a grace period for controller kato enforcement
*/
- cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 :
- cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000);
+ cmd.connect.kato = ctrl->kato ?
+ cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000) : 0;
+
+ if (ctrl->opts->disable_sqflow)
+ cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW;
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
@@ -401,9 +396,9 @@
strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE);
strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE);
- ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res,
+ ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res,
data, sizeof(*data), 0, NVME_QID_ANY, 1,
- BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
+ BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false);
if (ret) {
nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
&cmd, data);
@@ -427,6 +422,7 @@
* @qid: NVMe I/O queue number for the new I/O connection between
* host and target (note qid == 0 is illegal as this is
* the Admin queue, per NVMe standard).
+ * @poll: Whether or not to poll for the completion of the connect cmd.
*
* This function issues a fabrics-protocol connection
* of a NVMe I/O queue (via NVMe Fabrics "Connect" command)
@@ -438,7 +434,7 @@
* > 0: NVMe error status code
* < 0: Linux errno error code
*/
-int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
+int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid, bool poll)
{
struct nvme_command cmd;
struct nvmf_connect_data *data;
@@ -451,6 +447,9 @@
cmd.connect.qid = cpu_to_le16(qid);
cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize);
+ if (ctrl->opts->disable_sqflow)
+ cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW;
+
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
@@ -462,7 +461,7 @@
ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, &res,
data, sizeof(*data), 0, qid, 1,
- BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
+ BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, poll);
if (ret) {
nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32),
&cmd, data);
@@ -579,7 +578,7 @@
switch (ctrl->state) {
case NVME_CTRL_NEW:
case NVME_CTRL_CONNECTING:
- if (req->cmd->common.opcode == nvme_fabrics_command &&
+ if (nvme_is_fabrics(req->cmd) &&
req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
return true;
break;
@@ -607,6 +606,12 @@
{ NVMF_OPT_HOST_TRADDR, "host_traddr=%s" },
{ NVMF_OPT_HOST_ID, "hostid=%s" },
{ NVMF_OPT_DUP_CONNECT, "duplicate_connect" },
+ { NVMF_OPT_DISABLE_SQFLOW, "disable_sqflow" },
+ { NVMF_OPT_HDR_DIGEST, "hdr_digest" },
+ { NVMF_OPT_DATA_DIGEST, "data_digest" },
+ { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" },
+ { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" },
+ { NVMF_OPT_TOS, "tos=%d" },
{ NVMF_OPT_ERR, NULL }
};
@@ -626,6 +631,9 @@
opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY;
opts->kato = NVME_DEFAULT_KATO;
opts->duplicate_connect = false;
+ opts->hdr_digest = false;
+ opts->data_digest = false;
+ opts->tos = -1; /* < 0 == use transport default */
options = o = kstrdup(buf, GFP_KERNEL);
if (!options)
@@ -732,13 +740,6 @@
pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n");
}
opts->kato = token;
-
- if (opts->discovery_nqn && opts->kato) {
- pr_err("Discovery controllers cannot accept KATO != 0\n");
- ret = -EINVAL;
- goto out;
- }
-
break;
case NVMF_OPT_CTRL_LOSS_TMO:
if (match_int(args, &token)) {
@@ -817,6 +818,55 @@
case NVMF_OPT_DUP_CONNECT:
opts->duplicate_connect = true;
break;
+ case NVMF_OPT_DISABLE_SQFLOW:
+ opts->disable_sqflow = true;
+ break;
+ case NVMF_OPT_HDR_DIGEST:
+ opts->hdr_digest = true;
+ break;
+ case NVMF_OPT_DATA_DIGEST:
+ opts->data_digest = true;
+ break;
+ case NVMF_OPT_NR_WRITE_QUEUES:
+ if (match_int(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (token <= 0) {
+ pr_err("Invalid nr_write_queues %d\n", token);
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->nr_write_queues = token;
+ break;
+ case NVMF_OPT_NR_POLL_QUEUES:
+ if (match_int(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (token <= 0) {
+ pr_err("Invalid nr_poll_queues %d\n", token);
+ ret = -EINVAL;
+ goto out;
+ }
+ opts->nr_poll_queues = token;
+ break;
+ case NVMF_OPT_TOS:
+ if (match_int(args, &token)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (token < 0) {
+ pr_err("Invalid type of service %d\n", token);
+ ret = -EINVAL;
+ goto out;
+ }
+ if (token > 255) {
+ pr_warn("Clamping type of service to 255\n");
+ token = 255;
+ }
+ opts->tos = token;
+ break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
@@ -826,8 +876,9 @@
}
if (opts->discovery_nqn) {
- opts->kato = 0;
opts->nr_io_queues = 0;
+ opts->nr_write_queues = 0;
+ opts->nr_poll_queues = 0;
opts->duplicate_connect = true;
}
if (ctrl_loss_tmo < 0)
@@ -868,6 +919,36 @@
return 0;
}
+bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
+ struct nvmf_ctrl_options *opts)
+{
+ if (!nvmf_ctlr_matches_baseopts(ctrl, opts) ||
+ strcmp(opts->traddr, ctrl->opts->traddr) ||
+ strcmp(opts->trsvcid, ctrl->opts->trsvcid))
+ return false;
+
+ /*
+ * Checking the local address is rough. In most cases, none is specified
+ * and the host port is selected by the stack.
+ *
+ * Assume no match if:
+ * - local address is specified and address is not the same
+ * - local address is not specified but remote is, or vice versa
+ * (admin using specific host_traddr when it matters).
+ */
+ if ((opts->mask & NVMF_OPT_HOST_TRADDR) &&
+ (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR)) {
+ if (strcmp(opts->host_traddr, ctrl->opts->host_traddr))
+ return false;
+ } else if ((opts->mask & NVMF_OPT_HOST_TRADDR) ||
+ (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR)) {
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(nvmf_ip_options_match);
+
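A hedged sketch of how a transport-side duplicate-connect check might use the new helper; example_tcp_ctrl and its list are illustrative placeholders, not names taken from this series:

struct example_tcp_ctrl {
	struct nvme_ctrl	ctrl;
	struct list_head	list;
};

static bool example_existing_controller(struct list_head *ctrl_list,
					struct nvmf_ctrl_options *opts)
{
	struct example_tcp_ctrl *tc;

	/* reject a connect request that targets an already connected
	 * controller with the same remote and local address policy */
	list_for_each_entry(tc, ctrl_list, list)
		if (nvmf_ip_options_match(&tc->ctrl, opts))
			return true;
	return false;
}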
static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts,
unsigned int allowed_opts)
{
@@ -903,10 +984,11 @@
#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN)
#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
- NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT)
+ NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
+ NVMF_OPT_DISABLE_SQFLOW)
static struct nvme_ctrl *
-nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
+nvmf_create_ctrl(struct device *dev, const char *buf)
{
struct nvmf_ctrl_options *opts;
struct nvmf_transport_ops *ops;
@@ -1001,7 +1083,7 @@
goto out_unlock;
}
- ctrl = nvmf_create_ctrl(nvmf_device, buf, count);
+ ctrl = nvmf_create_ctrl(nvmf_device, buf);
if (IS_ERR(ctrl)) {
ret = PTR_ERR(ctrl);
goto out_unlock;
@@ -1116,6 +1198,7 @@
class_destroy(nvmf_class);
nvmf_host_put(nvmf_default_host);
+ BUILD_BUG_ON(sizeof(struct nvmf_common_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64);
BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index aa2fdb2..a0ec40a 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -1,15 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* NVMe over Fabrics common host code.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
*/
#ifndef _NVME_FABRICS_H
#define _NVME_FABRICS_H 1
@@ -58,6 +50,12 @@
NVMF_OPT_CTRL_LOSS_TMO = 1 << 11,
NVMF_OPT_HOST_ID = 1 << 12,
NVMF_OPT_DUP_CONNECT = 1 << 13,
+ NVMF_OPT_DISABLE_SQFLOW = 1 << 14,
+ NVMF_OPT_HDR_DIGEST = 1 << 15,
+ NVMF_OPT_DATA_DIGEST = 1 << 16,
+ NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
+ NVMF_OPT_NR_POLL_QUEUES = 1 << 18,
+ NVMF_OPT_TOS = 1 << 19,
};
/**
@@ -85,6 +83,12 @@
* @max_reconnects: maximum number of allowed reconnect attempts before removing
* the controller, (-1) means reconnect forever, zero means remove
* immediately;
+ * @disable_sqflow: disable controller sq flow control
+ * @hdr_digest: generate/verify header digest (TCP)
+ * @data_digest: generate/verify data digest (TCP)
+ * @nr_write_queues: number of queues for write I/O
+ * @nr_poll_queues: number of queues for polling I/O
+ * @tos: type of service
*/
struct nvmf_ctrl_options {
unsigned mask;
@@ -101,6 +105,12 @@
unsigned int kato;
struct nvmf_host *host;
int max_reconnects;
+ bool disable_sqflow;
+ bool hdr_digest;
+ bool data_digest;
+ unsigned int nr_write_queues;
+ unsigned int nr_poll_queues;
+ int tos;
};
/*
@@ -156,7 +166,7 @@
int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl);
-int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid);
+int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid, bool poll);
int nvmf_register_transport(struct nvmf_transport_ops *ops);
void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
void nvmf_free_options(struct nvmf_ctrl_options *opts);
@@ -166,12 +176,13 @@
struct request *rq);
bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
bool queue_live);
+bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
+ struct nvmf_ctrl_options *opts);
static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
bool queue_live)
{
- if (likely(ctrl->state == NVME_CTRL_LIVE ||
- ctrl->state == NVME_CTRL_ADMIN_ONLY))
+ if (likely(ctrl->state == NVME_CTRL_LIVE))
return true;
return __nvmf_check_ready(ctrl, rq, queue_live);
}
diff --git a/drivers/nvme/host/fault_inject.c b/drivers/nvme/host/fault_inject.c
index 0263226..1352159 100644
--- a/drivers/nvme/host/fault_inject.c
+++ b/drivers/nvme/host/fault_inject.c
@@ -1,8 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* fault injection support for nvme.
*
* Copyright (c) 2018, Oracle and/or its affiliates
- *
*/
#include <linux/moduleparam.h>
@@ -15,11 +15,10 @@
static char *fail_request;
module_param(fail_request, charp, 0000);
-void nvme_fault_inject_init(struct nvme_ns *ns)
+void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj,
+ const char *dev_name)
{
struct dentry *dir, *parent;
- char *name = ns->disk->disk_name;
- struct nvme_fault_inject *fault_inj = &ns->fault_inject;
struct fault_attr *attr = &fault_inj->attr;
/* set default fault injection attribute */
@@ -27,20 +26,20 @@
setup_fault_attr(&fail_default_attr, fail_request);
/* create debugfs directory and attribute */
- parent = debugfs_create_dir(name, NULL);
+ parent = debugfs_create_dir(dev_name, NULL);
if (!parent) {
- pr_warn("%s: failed to create debugfs directory\n", name);
+ pr_warn("%s: failed to create debugfs directory\n", dev_name);
return;
}
*attr = fail_default_attr;
dir = fault_create_debugfs_attr("fault_inject", parent, attr);
if (IS_ERR(dir)) {
- pr_warn("%s: failed to create debugfs attr\n", name);
+ pr_warn("%s: failed to create debugfs attr\n", dev_name);
debugfs_remove_recursive(parent);
return;
}
- ns->fault_inject.parent = parent;
+ fault_inj->parent = parent;
/* create debugfs for status code and dont_retry */
fault_inj->status = NVME_SC_INVALID_OPCODE;
@@ -49,29 +48,33 @@
debugfs_create_bool("dont_retry", 0600, dir, &fault_inj->dont_retry);
}
-void nvme_fault_inject_fini(struct nvme_ns *ns)
+void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inject)
{
/* remove debugfs directories */
- debugfs_remove_recursive(ns->fault_inject.parent);
+ debugfs_remove_recursive(fault_inject->parent);
}
void nvme_should_fail(struct request *req)
{
struct gendisk *disk = req->rq_disk;
- struct nvme_ns *ns = NULL;
+ struct nvme_fault_inject *fault_inject = NULL;
u16 status;
- /*
- * make sure this request is coming from a valid namespace
- */
- if (!disk)
- return;
+ if (disk) {
+ struct nvme_ns *ns = disk->private_data;
- ns = disk->private_data;
- if (ns && should_fail(&ns->fault_inject.attr, 1)) {
+ if (ns)
+ fault_inject = &ns->fault_inject;
+ else
+ WARN_ONCE(1, "No namespace found for request\n");
+ } else {
+ fault_inject = &nvme_req(req)->ctrl->fault_inject;
+ }
+
+ if (fault_inject && should_fail(&fault_inject->attr, 1)) {
/* inject status code and DNR bit */
- status = ns->fault_inject.status;
- if (ns->fault_inject.dont_retry)
+ status = fault_inject->status;
+ if (fault_inject->dont_retry)
status |= NVME_SC_DNR;
nvme_req(req)->status = status;
}
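With the fault-injection state decoupled from struct nvme_ns, any object that embeds a struct nvme_fault_inject can reuse the same debugfs plumbing. A hedged sketch, where example_port is an illustrative placeholder rather than anything from this patch:

struct example_port {
	struct nvme_fault_inject fault_inject;
};

static void example_port_debugfs_register(struct example_port *port,
					  const char *name)
{
	/* creates <debugfs>/<name>/fault_inject/ plus status and dont_retry */
	nvme_fault_inject_init(&port->fault_inject, name);
}

static void example_port_debugfs_unregister(struct example_port *port)
{
	nvme_fault_inject_fini(&port->fault_inject);
}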
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 9375fa7..265f89e 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1,18 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2016 Avago Technologies. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful.
- * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES,
- * INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A
- * PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE DISCLAIMED, EXCEPT TO
- * THE EXTENT THAT SUCH DISCLAIMERS ARE HELD TO BE LEGALLY INVALID.
- * See the GNU General Public License for more details, a copy of which
- * can be found in the file COPYING included with this package
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -20,12 +8,13 @@
#include <uapi/scsi/fc/fc_fs.h>
#include <uapi/scsi/fc/fc_els.h>
#include <linux/delay.h>
+#include <linux/overflow.h>
#include "nvme.h"
#include "fabrics.h"
#include <linux/nvme-fc-driver.h>
#include <linux/nvme-fc.h>
-
+#include <scsi/scsi_transport_fc.h>
/* *************************** Data Structures/Defines ****************** */
@@ -104,6 +93,12 @@
struct nvme_fc_ersp_iu rsp_iu;
};
+struct nvme_fcp_op_w_sgl {
+ struct nvme_fc_fcp_op op;
+ struct scatterlist sgl[SG_CHUNK_SIZE];
+ uint8_t priv[0];
+};
+
struct nvme_fc_lport {
struct nvme_fc_local_port localport;
@@ -122,6 +117,7 @@
struct list_head endp_list; /* for lport->endp_list */
struct list_head ctrl_list;
struct list_head ls_req_list;
+ struct list_head disc_list;
struct device *dev; /* physical device for dma */
struct nvme_fc_lport *lport;
spinlock_t lock;
@@ -206,13 +202,15 @@
static DEFINE_IDA(nvme_fc_local_port_cnt);
static DEFINE_IDA(nvme_fc_ctrl_cnt);
+static struct workqueue_struct *nvme_fc_wq;
+static bool nvme_fc_waiting_to_unload;
+static DECLARE_COMPLETION(nvme_fc_unload_proceed);
/*
* These items are short-term. They will eventually be moved into
* a generic FC class. See comments in module init.
*/
-static struct class *fc_class;
static struct device *fc_udev_device;
@@ -234,6 +232,8 @@
/* remove from transport list */
spin_lock_irqsave(&nvme_fc_lock, flags);
list_del(&lport->port_list);
+ if (nvme_fc_waiting_to_unload && list_empty(&nvme_fc_lport_list))
+ complete(&nvme_fc_unload_proceed);
spin_unlock_irqrestore(&nvme_fc_lock, flags);
ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num);
@@ -319,7 +319,7 @@
* @template: LLDD entrypoints and operational parameters for the port
* @dev: physical hardware device node port corresponds to. Will be
* used for DMA mappings
- * @lport_p: pointer to a local port pointer. Upon success, the routine
+ * @portptr: pointer to a local port pointer. Upon success, the routine
* will allocate a nvme_fc_local_port structure and place its
* address in the local port pointer. Upon failure, local port
* pointer will be set to 0.
@@ -427,8 +427,7 @@
* nvme_fc_unregister_localport - transport entry point called by an
* LLDD to deregister/remove a previously
* registered a NVME host FC port.
- * @localport: pointer to the (registered) local port that is to be
- * deregistered.
+ * @portptr: pointer to the (registered) local port that is to be deregistered.
*
* Returns:
* a completion status. Must be 0 upon success; a negative errno
@@ -509,6 +508,7 @@
list_del(&rport->endp_list);
spin_unlock_irqrestore(&nvme_fc_lock, flags);
+ WARN_ON(!list_empty(&rport->disc_list));
ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num);
kfree(rport);
@@ -633,7 +633,7 @@
* @localport: pointer to the (registered) local port that the remote
* subsystem port is connected to.
* @pinfo: pointer to information about the port to be registered
- * @rport_p: pointer to a remote port pointer. Upon success, the routine
+ * @portptr: pointer to a remote port pointer. Upon success, the routine
* will allocate a nvme_fc_remote_port structure and place its
* address in the remote port pointer. Upon failure, remote port
* pointer will be set to 0.
@@ -696,6 +696,7 @@
INIT_LIST_HEAD(&newrec->endp_list);
INIT_LIST_HEAD(&newrec->ctrl_list);
INIT_LIST_HEAD(&newrec->ls_req_list);
+ INIT_LIST_HEAD(&newrec->disc_list);
kref_init(&newrec->ref);
atomic_set(&newrec->act_ctrl_cnt, 0);
spin_lock_init(&newrec->lock);
@@ -809,8 +810,8 @@
* nvme_fc_unregister_remoteport - transport entry point called by an
* LLDD to deregister/remove a previously
* registered a NVME subsystem FC port.
- * @remoteport: pointer to the (registered) remote port that is to be
- * deregistered.
+ * @portptr: pointer to the (registered) remote port that is to be
+ * deregistered.
*
* Returns:
* a completion status. Must be 0 upon success; a negative errno
@@ -1387,7 +1388,7 @@
__nvme_fc_finish_ls_req(lsop);
- /* fc-nvme iniator doesn't care about success or failure of cmd */
+ /* fc-nvme initiator doesn't care about success or failure of cmd */
kfree(lsop);
}
@@ -1607,9 +1608,13 @@
sizeof(op->rsp_iu), DMA_FROM_DEVICE);
if (opstate == FCPOP_STATE_ABORTED)
- status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
- else if (freq->status)
- status = cpu_to_le16(NVME_SC_INTERNAL << 1);
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ else if (freq->status) {
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: io failed due to lldd error %d\n",
+ ctrl->cnum, freq->status);
+ }
/*
* For the linux implementation, if we have an unsuccessful
@@ -1636,8 +1641,13 @@
* no payload in the CQE by the transport.
*/
if (freq->transferred_length !=
- be32_to_cpu(op->cmd_iu.data_len)) {
- status = cpu_to_le16(NVME_SC_INTERNAL << 1);
+ be32_to_cpu(op->cmd_iu.data_len)) {
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: io failed due to bad transfer "
+ "length: %d vs expected %d\n",
+ ctrl->cnum, freq->transferred_length,
+ be32_to_cpu(op->cmd_iu.data_len));
goto done;
}
result.u64 = 0;
@@ -1654,7 +1664,17 @@
freq->transferred_length ||
op->rsp_iu.status_code ||
sqe->common.command_id != cqe->command_id)) {
- status = cpu_to_le16(NVME_SC_INTERNAL << 1);
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: io failed due to bad NVMe_ERSP: "
+ "iu len %d, xfr len %d vs %d, status code "
+ "%d, cmdid %d vs %d\n",
+ ctrl->cnum, be16_to_cpu(op->rsp_iu.iu_len),
+ be32_to_cpu(op->rsp_iu.xfrd_len),
+ freq->transferred_length,
+ op->rsp_iu.status_code,
+ sqe->common.command_id,
+ cqe->command_id);
goto done;
}
result = cqe->result;
@@ -1662,7 +1682,11 @@
break;
default:
- status = cpu_to_le16(NVME_SC_INTERNAL << 1);
+ status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: io failed due to odd NVMe_xRSP iu "
+ "len %d\n",
+ ctrl->cnum, freq->rcv_rsplen);
goto done;
}
@@ -1691,6 +1715,8 @@
struct nvme_fc_queue *queue, struct nvme_fc_fcp_op *op,
struct request *rq, u32 rqno)
{
+ struct nvme_fcp_op_w_sgl *op_w_sgl =
+ container_of(op, typeof(*op_w_sgl), op);
struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
int ret = 0;
@@ -1700,8 +1726,6 @@
op->fcp_req.rspaddr = &op->rsp_iu;
op->fcp_req.rsplen = sizeof(op->rsp_iu);
op->fcp_req.done = nvme_fc_fcpio_done;
- op->fcp_req.first_sgl = (struct scatterlist *)&op[1];
- op->fcp_req.private = &op->fcp_req.first_sgl[SG_CHUNK_SIZE];
op->ctrl = ctrl;
op->queue = queue;
op->rq = rq;
@@ -1739,12 +1763,18 @@
unsigned int hctx_idx, unsigned int numa_node)
{
struct nvme_fc_ctrl *ctrl = set->driver_data;
- struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
+ struct nvme_fcp_op_w_sgl *op = blk_mq_rq_to_pdu(rq);
int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
struct nvme_fc_queue *queue = &ctrl->queues[queue_idx];
+ int res;
+ res = __nvme_fc_init_request(ctrl, queue, &op->op, rq, queue->rqcnt++);
+ if (res)
+ return res;
+ op->op.fcp_req.first_sgl = &op->sgl[0];
+ op->op.fcp_req.private = &op->priv[0];
nvme_req(rq)->ctrl = &ctrl->ctrl;
- return __nvme_fc_init_request(ctrl, queue, op, rq, queue->rqcnt++);
+ return res;
}
static int
@@ -1774,7 +1804,6 @@
}
aen_op->flags = FCOP_FLAGS_AEN;
- aen_op->fcp_req.first_sgl = NULL; /* no sg list */
aen_op->fcp_req.private = private;
memset(sqe, 0, sizeof(*sqe));
@@ -1844,7 +1873,7 @@
memset(queue, 0, sizeof(*queue));
queue->ctrl = ctrl;
queue->qnum = idx;
- atomic_set(&queue->csn, 1);
+ atomic_set(&queue->csn, 0);
queue->dev = ctrl->dev;
if (idx > 0)
@@ -1886,7 +1915,7 @@
*/
queue->connection_id = 0;
- atomic_set(&queue->csn, 1);
+ atomic_set(&queue->csn, 0);
}
static void
@@ -1962,7 +1991,7 @@
(qsize / 5));
if (ret)
break;
- ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
+ ret = nvmf_connect_io_queue(&ctrl->ctrl, i, false);
if (ret)
break;
@@ -2000,6 +2029,7 @@
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
blk_cleanup_queue(ctrl->ctrl.admin_q);
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
blk_mq_free_tag_set(&ctrl->admin_tag_set);
kfree(ctrl->queues);
@@ -2053,7 +2083,7 @@
*/
if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) {
active = atomic_xchg(&ctrl->err_work_active, 1);
- if (!active && !schedule_work(&ctrl->err_work)) {
+ if (!active && !queue_work(nvme_fc_wq, &ctrl->err_work)) {
atomic_set(&ctrl->err_work_active, 0);
WARN_ON(1);
}
@@ -2101,27 +2131,26 @@
struct nvme_fc_fcp_op *op)
{
struct nvmefc_fcp_req *freq = &op->fcp_req;
- enum dma_data_direction dir;
int ret;
freq->sg_cnt = 0;
- if (!blk_rq_payload_bytes(rq))
+ if (!blk_rq_nr_phys_segments(rq))
return 0;
freq->sg_table.sgl = freq->first_sgl;
ret = sg_alloc_table_chained(&freq->sg_table,
- blk_rq_nr_phys_segments(rq), freq->sg_table.sgl);
+ blk_rq_nr_phys_segments(rq), freq->sg_table.sgl,
+ SG_CHUNK_SIZE);
if (ret)
return -ENOMEM;
op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl);
WARN_ON(op->nents > blk_rq_nr_phys_segments(rq));
- dir = (rq_data_dir(rq) == WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl,
- op->nents, dir);
+ op->nents, rq_dma_dir(rq));
if (unlikely(freq->sg_cnt <= 0)) {
- sg_free_table_chained(&freq->sg_table, true);
+ sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE);
freq->sg_cnt = 0;
return -EFAULT;
}
@@ -2142,12 +2171,11 @@
return;
fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents,
- ((rq_data_dir(rq) == WRITE) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE));
+ rq_dma_dir(rq));
nvme_cleanup_cmd(rq);
- sg_free_table_chained(&freq->sg_table, true);
+ sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE);
freq->sg_cnt = 0;
}
@@ -2182,7 +2210,6 @@
{
struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
struct nvme_command *sqe = &cmdiu->sqe;
- u32 csn;
int ret, opstate;
/*
@@ -2197,8 +2224,6 @@
/* format the FC-NVME CMD IU and fcp_req */
cmdiu->connection_id = cpu_to_be64(queue->connection_id);
- csn = atomic_inc_return(&queue->csn);
- cmdiu->csn = cpu_to_be32(csn);
cmdiu->data_len = cpu_to_be32(data_len);
switch (io_dir) {
case NVMEFC_FCP_WRITE:
@@ -2256,11 +2281,24 @@
if (!(op->flags & FCOP_FLAGS_AEN))
blk_mq_start_request(op->rq);
+ cmdiu->csn = cpu_to_be32(atomic_inc_return(&queue->csn));
ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport,
&ctrl->rport->remoteport,
queue->lldd_handle, &op->fcp_req);
if (ret) {
+ /*
+ * If the lld fails to send the command, is there an issue with
+ * the csn value? If the command that fails is the Connect,
+ * no - as the connection won't be live. If it is a command
+ * post-connect, it's possible a gap in csn may be created.
+ * Does this matter? As Linux initiators don't send fused
+ * commands, no. The gap would exist, but as there's nothing
+ * that depends on csn order to be delivered on the target
+ * side, it shouldn't hurt. It would be difficult for a
+ * target to even detect the csn gap as it has no idea when the
+ * cmd with the csn was supposed to arrive.
+ */
opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE);
__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
@@ -2303,48 +2341,27 @@
if (ret)
return ret;
- data_len = blk_rq_payload_bytes(rq);
- if (data_len)
+ /*
+ * nvme core doesn't quite treat the rq opaquely. Commands such
+ * as WRITE ZEROES will return a non-zero rq payload_bytes yet
+ * there is no actual payload to be transferred.
+ * To get it right, key data transmission on there being 1 or
+ * more physical segments in the sg list. If there is no
+ * physical segments, there is no payload.
+ */
+ if (blk_rq_nr_phys_segments(rq)) {
+ data_len = blk_rq_payload_bytes(rq);
io_dir = ((rq_data_dir(rq) == WRITE) ?
NVMEFC_FCP_WRITE : NVMEFC_FCP_READ);
- else
+ } else {
+ data_len = 0;
io_dir = NVMEFC_FCP_NODATA;
+ }
+
return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir);
}
-static struct blk_mq_tags *
-nvme_fc_tagset(struct nvme_fc_queue *queue)
-{
- if (queue->qnum == 0)
- return queue->ctrl->admin_tag_set.tags[queue->qnum];
-
- return queue->ctrl->tag_set.tags[queue->qnum - 1];
-}
-
-static int
-nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
-
-{
- struct nvme_fc_queue *queue = hctx->driver_data;
- struct nvme_fc_ctrl *ctrl = queue->ctrl;
- struct request *req;
- struct nvme_fc_fcp_op *op;
-
- req = blk_mq_tag_to_rq(nvme_fc_tagset(queue), tag);
- if (!req)
- return 0;
-
- op = blk_mq_rq_to_pdu(req);
-
- if ((atomic_read(&op->state) == FCPOP_STATE_ACTIVE) &&
- (ctrl->lport->ops->poll_queue))
- ctrl->lport->ops->poll_queue(&ctrl->lport->localport,
- queue->lldd_handle);
-
- return ((atomic_read(&op->state) != FCPOP_STATE_ACTIVE));
-}
-
static void
nvme_fc_submit_async_event(struct nvme_ctrl *arg)
{
@@ -2397,7 +2414,7 @@
* status. The done path will return the io request back to the block
* layer with an error status.
*/
-static void
+static bool
nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
{
struct nvme_ctrl *nctrl = data;
@@ -2405,6 +2422,7 @@
struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req);
__nvme_fc_abort_op(ctrl, op);
+ return true;
}
@@ -2414,7 +2432,6 @@
.init_request = nvme_fc_init_request,
.exit_request = nvme_fc_exit_request,
.init_hctx = nvme_fc_init_hctx,
- .poll = nvme_fc_poll,
.timeout = nvme_fc_timeout,
};
@@ -2444,12 +2461,11 @@
ctrl->tag_set.ops = &nvme_fc_mq_ops;
ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
ctrl->tag_set.reserved_tags = 1; /* fabric connect */
- ctrl->tag_set.numa_node = NUMA_NO_NODE;
+ ctrl->tag_set.numa_node = ctrl->ctrl.numa_node;
ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
- ctrl->tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
- (SG_CHUNK_SIZE *
- sizeof(struct scatterlist)) +
- ctrl->lport->ops->fcprqst_priv_sz;
+ ctrl->tag_set.cmd_size =
+ struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
+ ctrl->lport->ops->fcprqst_priv_sz);
ctrl->tag_set.driver_data = ctrl;
ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1;
ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
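struct_size(), from the linux/overflow.h include added earlier in this file, evaluates to sizeof(*ptr) plus count elements of the named flexible member and saturates rather than wrapping on overflow, so the command size above is equivalent to sizeof(struct nvme_fcp_op_w_sgl) + fcprqst_priv_sz bytes: the op, the embedded SG_CHUNK_SIZE scatterlist and the LLDD private area that the old open-coded sum added up by hand.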
@@ -2496,6 +2512,7 @@
nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl)
{
struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+ u32 prior_ioq_cnt = ctrl->ctrl.queue_count - 1;
unsigned int nr_io_queues;
int ret;
@@ -2508,6 +2525,13 @@
return ret;
}
+ if (!nr_io_queues && prior_ioq_cnt) {
+ dev_info(ctrl->ctrl.device,
+ "Fail Reconnect: At least 1 io queue "
+ "required (was %d)\n", prior_ioq_cnt);
+ return -ENOSPC;
+ }
+
ctrl->ctrl.queue_count = nr_io_queues + 1;
/* check for io queues existing */
if (ctrl->ctrl.queue_count == 1)
@@ -2521,6 +2545,10 @@
if (ret)
goto out_delete_hw_queues;
+ if (prior_ioq_cnt != nr_io_queues)
+ dev_info(ctrl->ctrl.device,
+ "reconnect: revising io queue count from %d to %d\n",
+ prior_ioq_cnt, nr_io_queues);
blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues);
return 0;
@@ -2606,6 +2634,12 @@
if (nvme_fc_ctlr_active_on_rport(ctrl))
return -ENOTUNIQ;
+ dev_info(ctrl->ctrl.device,
+ "NVME-FC{%d}: create association : host wwpn 0x%016llx "
+ " rport wwpn 0x%016llx: NQN \"%s\"\n",
+ ctrl->cnum, ctrl->lport->localport.port_name,
+ ctrl->rport->remoteport.port_name, ctrl->ctrl.opts->subsysnqn);
+
/*
* Create the admin queue
*/
@@ -2620,8 +2654,6 @@
if (ret)
goto out_delete_hw_queue;
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
-
ret = nvmf_connect_admin_queue(&ctrl->ctrl);
if (ret)
goto out_disconnect_admin_queue;
@@ -2635,23 +2667,15 @@
* prior connection values
*/
- ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap);
- if (ret) {
- dev_err(ctrl->ctrl.device,
- "prop_get NVME_REG_CAP failed\n");
- goto out_disconnect_admin_queue;
- }
-
- ctrl->ctrl.sqsize =
- min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
-
- ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+ ret = nvme_enable_ctrl(&ctrl->ctrl);
if (ret)
goto out_disconnect_admin_queue;
ctrl->ctrl.max_hw_sectors =
(ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9);
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+
ret = nvme_init_identify(&ctrl->ctrl);
if (ret)
goto out_disconnect_admin_queue;
@@ -2761,6 +2785,7 @@
nvme_stop_queues(&ctrl->ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
+ blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
}
/*
@@ -2783,6 +2808,7 @@
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
+ blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
/* kill the aens as they are a separate path */
nvme_fc_abort_aen_ops(ctrl);
@@ -3038,6 +3064,10 @@
ctrl->ctrl.opts = opts;
ctrl->ctrl.nr_reconnects = 0;
+ if (lport->dev)
+ ctrl->ctrl.numa_node = dev_to_node(lport->dev);
+ else
+ ctrl->ctrl.numa_node = NUMA_NO_NODE;
INIT_LIST_HEAD(&ctrl->ctrl_list);
ctrl->lport = lport;
ctrl->rport = rport;
@@ -3078,11 +3108,10 @@
ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
- ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
- ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
- (SG_CHUNK_SIZE *
- sizeof(struct scatterlist)) +
- ctrl->lport->ops->fcprqst_priv_sz;
+ ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node;
+ ctrl->admin_tag_set.cmd_size =
+ struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
+ ctrl->lport->ops->fcprqst_priv_sz);
ctrl->admin_tag_set.driver_data = ctrl;
ctrl->admin_tag_set.nr_hw_queues = 1;
ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
@@ -3093,10 +3122,16 @@
goto out_free_queues;
ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set;
+ ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+ if (IS_ERR(ctrl->ctrl.fabrics_q)) {
+ ret = PTR_ERR(ctrl->ctrl.fabrics_q);
+ goto out_free_admin_tag_set;
+ }
+
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.admin_q)) {
ret = PTR_ERR(ctrl->ctrl.admin_q);
- goto out_free_admin_tag_set;
+ goto out_cleanup_fabrics_q;
}
/*
@@ -3168,6 +3203,8 @@
out_cleanup_admin_q:
blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_cleanup_fabrics_q:
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
out_free_admin_tag_set:
blk_mq_free_tag_set(&ctrl->admin_tag_set);
out_free_queues:
@@ -3212,7 +3249,7 @@
substring_t wwn = { name, &name[sizeof(name)-1] };
int nnoffset, pnoffset;
- /* validate it string one of the 2 allowed formats */
+ /* validate if string is one of the 2 allowed formats */
if (strnlen(buf, blen) == NVME_FC_TRADDR_MAXLENGTH &&
!strncmp(buf, "nn-0x", NVME_FC_TRADDR_OXNNLEN) &&
!strncmp(&buf[NVME_FC_TRADDR_MAX_PN_OFFSET],
@@ -3307,10 +3344,98 @@
.create_ctrl = nvme_fc_create_ctrl,
};
+/* Arbitrary successive failures max. With lots of subsystems could be high */
+#define DISCOVERY_MAX_FAIL 20
+
+static ssize_t nvme_fc_nvme_discovery_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ unsigned long flags;
+ LIST_HEAD(local_disc_list);
+ struct nvme_fc_lport *lport;
+ struct nvme_fc_rport *rport;
+ int failcnt = 0;
+
+ spin_lock_irqsave(&nvme_fc_lock, flags);
+restart:
+ list_for_each_entry(lport, &nvme_fc_lport_list, port_list) {
+ list_for_each_entry(rport, &lport->endp_list, endp_list) {
+ if (!nvme_fc_lport_get(lport))
+ continue;
+ if (!nvme_fc_rport_get(rport)) {
+ /*
+ * This is a temporary condition. Upon restart
+ * this rport will be gone from the list.
+ *
+ * Revert the lport put and retry. Anything
+ * added to the list already will be skipped (as
+ * they are no longer list_empty). Loops should
+ * resume at rports that were not yet seen.
+ */
+ nvme_fc_lport_put(lport);
+
+ if (failcnt++ < DISCOVERY_MAX_FAIL)
+ goto restart;
+
+ pr_err("nvme_discovery: too many reference "
+ "failures\n");
+ goto process_local_list;
+ }
+ if (list_empty(&rport->disc_list))
+ list_add_tail(&rport->disc_list,
+ &local_disc_list);
+ }
+ }
+
+process_local_list:
+ while (!list_empty(&local_disc_list)) {
+ rport = list_first_entry(&local_disc_list,
+ struct nvme_fc_rport, disc_list);
+ list_del_init(&rport->disc_list);
+ spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+ lport = rport->lport;
+ /* signal discovery. Won't hurt if it repeats */
+ nvme_fc_signal_discovery_scan(lport, rport);
+ nvme_fc_rport_put(rport);
+ nvme_fc_lport_put(lport);
+
+ spin_lock_irqsave(&nvme_fc_lock, flags);
+ }
+ spin_unlock_irqrestore(&nvme_fc_lock, flags);
+
+ return count;
+}
+static DEVICE_ATTR(nvme_discovery, 0200, NULL, nvme_fc_nvme_discovery_store);
+
+static struct attribute *nvme_fc_attrs[] = {
+ &dev_attr_nvme_discovery.attr,
+ NULL
+};
+
+static struct attribute_group nvme_fc_attr_group = {
+ .attrs = nvme_fc_attrs,
+};
+
+static const struct attribute_group *nvme_fc_attr_groups[] = {
+ &nvme_fc_attr_group,
+ NULL
+};
+
+static struct class fc_class = {
+ .name = "fc",
+ .dev_groups = nvme_fc_attr_groups,
+ .owner = THIS_MODULE,
+};
+
static int __init nvme_fc_init_module(void)
{
int ret;
+ nvme_fc_wq = alloc_workqueue("nvme_fc_wq", WQ_MEM_RECLAIM, 0);
+ if (!nvme_fc_wq)
+ return -ENOMEM;
+
/*
* NOTE:
* It is expected that in the future the kernel will combine
@@ -3325,16 +3450,16 @@
* put in place, this code will move to a more generic
* location for the class.
*/
- fc_class = class_create(THIS_MODULE, "fc");
- if (IS_ERR(fc_class)) {
+ ret = class_register(&fc_class);
+ if (ret) {
pr_err("couldn't register class fc\n");
- return PTR_ERR(fc_class);
+ goto out_destroy_wq;
}
/*
* Create a device for the FC-centric udev events
*/
- fc_udev_device = device_create(fc_class, NULL, MKDEV(0, 0), NULL,
+ fc_udev_device = device_create(&fc_class, NULL, MKDEV(0, 0), NULL,
"fc_udev_device");
if (IS_ERR(fc_udev_device)) {
pr_err("couldn't create fc_udev device!\n");
@@ -3349,25 +3474,69 @@
return 0;
out_destroy_device:
- device_destroy(fc_class, MKDEV(0, 0));
+ device_destroy(&fc_class, MKDEV(0, 0));
out_destroy_class:
- class_destroy(fc_class);
+ class_unregister(&fc_class);
+out_destroy_wq:
+ destroy_workqueue(nvme_fc_wq);
+
return ret;
}
+static void
+nvme_fc_delete_controllers(struct nvme_fc_rport *rport)
+{
+ struct nvme_fc_ctrl *ctrl;
+
+ spin_lock(&rport->lock);
+ list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) {
+ dev_warn(ctrl->ctrl.device,
+ "NVME-FC{%d}: transport unloading: deleting ctrl\n",
+ ctrl->cnum);
+ nvme_delete_ctrl(&ctrl->ctrl);
+ }
+ spin_unlock(&rport->lock);
+}
+
+static void
+nvme_fc_cleanup_for_unload(void)
+{
+ struct nvme_fc_lport *lport;
+ struct nvme_fc_rport *rport;
+
+ list_for_each_entry(lport, &nvme_fc_lport_list, port_list) {
+ list_for_each_entry(rport, &lport->endp_list, endp_list) {
+ nvme_fc_delete_controllers(rport);
+ }
+ }
+}
+
static void __exit nvme_fc_exit_module(void)
{
- /* sanity check - all lports should be removed */
- if (!list_empty(&nvme_fc_lport_list))
- pr_warn("%s: localport list not empty\n", __func__);
+ unsigned long flags;
+ bool need_cleanup = false;
+
+ spin_lock_irqsave(&nvme_fc_lock, flags);
+ nvme_fc_waiting_to_unload = true;
+ if (!list_empty(&nvme_fc_lport_list)) {
+ need_cleanup = true;
+ nvme_fc_cleanup_for_unload();
+ }
+ spin_unlock_irqrestore(&nvme_fc_lock, flags);
+ if (need_cleanup) {
+ pr_info("%s: waiting for ctlr deletes\n", __func__);
+ wait_for_completion(&nvme_fc_unload_proceed);
+ pr_info("%s: ctrl deletes complete\n", __func__);
+ }
nvmf_unregister_transport(&nvme_fc_transport);
ida_destroy(&nvme_fc_local_port_cnt);
ida_destroy(&nvme_fc_ctrl_cnt);
- device_destroy(fc_class, MKDEV(0, 0));
- class_destroy(fc_class);
+ device_destroy(&fc_class, MKDEV(0, 0));
+ class_unregister(&fc_class);
+ destroy_workqueue(nvme_fc_wq);
}
module_init(nvme_fc_init_module);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 6fe5923..ec46693 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -1,23 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* nvme-lightnvm.c - LightNVM NVMe device
*
* Copyright (C) 2014-2015 IT University of Copenhagen
* Initial release: Matias Bjorling <mb@lightnvm.io>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- *
*/
#include "nvme.h"
@@ -567,17 +553,18 @@
* Expect the lba in device format
*/
static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
- struct nvm_chk_meta *meta,
- sector_t slba, int nchks)
+ sector_t slba, int nchks,
+ struct nvm_chk_meta *meta)
{
struct nvm_geo *geo = &ndev->geo;
struct nvme_ns *ns = ndev->q->queuedata;
struct nvme_ctrl *ctrl = ns->ctrl;
- struct nvme_nvm_chk_meta *dev_meta = (struct nvme_nvm_chk_meta *)meta;
+ struct nvme_nvm_chk_meta *dev_meta, *dev_meta_off;
struct ppa_addr ppa;
size_t left = nchks * sizeof(struct nvme_nvm_chk_meta);
size_t log_pos, offset, len;
- int ret, i, max_len;
+ int i, max_len;
+ int ret = 0;
/*
* limit requests to maximum 256K to avoid issuing arbitrary large
@@ -585,6 +572,10 @@
*/
max_len = min_t(unsigned int, ctrl->max_hw_sectors << 9, 256 * 1024);
+ dev_meta = kmalloc(max_len, GFP_KERNEL);
+ if (!dev_meta)
+ return -ENOMEM;
+
/* Normalize lba address space to obtain log offset */
ppa.ppa = slba;
ppa = dev_to_generic_addr(ndev, ppa);
@@ -598,6 +589,9 @@
while (left) {
len = min_t(unsigned int, left, max_len);
+ memset(dev_meta, 0, max_len);
+ dev_meta_off = dev_meta;
+
ret = nvme_get_log(ctrl, ns->head->ns_id,
NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len,
offset);
@@ -607,21 +601,23 @@
}
for (i = 0; i < len; i += sizeof(struct nvme_nvm_chk_meta)) {
- meta->state = dev_meta->state;
- meta->type = dev_meta->type;
- meta->wi = dev_meta->wi;
- meta->slba = le64_to_cpu(dev_meta->slba);
- meta->cnlb = le64_to_cpu(dev_meta->cnlb);
- meta->wp = le64_to_cpu(dev_meta->wp);
+ meta->state = dev_meta_off->state;
+ meta->type = dev_meta_off->type;
+ meta->wi = dev_meta_off->wi;
+ meta->slba = le64_to_cpu(dev_meta_off->slba);
+ meta->cnlb = le64_to_cpu(dev_meta_off->cnlb);
+ meta->wp = le64_to_cpu(dev_meta_off->wp);
meta++;
- dev_meta++;
+ dev_meta_off++;
}
offset += len;
left -= len;
}
+ kfree(dev_meta);
+
return ret;
}
@@ -664,18 +660,21 @@
rq->cmd_flags &= ~REQ_FAILFAST_DRIVER;
if (rqd->bio)
- blk_init_request_from_bio(rq, rqd->bio);
+ blk_rq_append_bio(rq, &rqd->bio);
else
rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
return rq;
}
-static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
+static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd,
+ void *buf)
{
+ struct nvm_geo *geo = &dev->geo;
struct request_queue *q = dev->q;
struct nvme_nvm_command *cmd;
struct request *rq;
+ int ret;
cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL);
if (!cmd)
@@ -683,8 +682,15 @@
rq = nvme_nvm_alloc_request(q, rqd, cmd);
if (IS_ERR(rq)) {
- kfree(cmd);
- return PTR_ERR(rq);
+ ret = PTR_ERR(rq);
+ goto err_free_cmd;
+ }
+
+ if (buf) {
+ ret = blk_rq_map_kern(q, rq, buf, geo->csecs * rqd->nr_ppas,
+ GFP_KERNEL);
+ if (ret)
+ goto err_free_cmd;
}
rq->end_io_data = rqd;
@@ -692,41 +698,18 @@
blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io);
return 0;
-}
-static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
- struct request_queue *q = dev->q;
- struct request *rq;
- struct nvme_nvm_command cmd;
- int ret = 0;
-
- memset(&cmd, 0, sizeof(struct nvme_nvm_command));
-
- rq = nvme_nvm_alloc_request(q, rqd, &cmd);
- if (IS_ERR(rq))
- return PTR_ERR(rq);
-
- /* I/Os can fail and the error is signaled through rqd. Callers must
- * handle the error accordingly.
- */
- blk_execute_rq(q, NULL, rq, 0);
- if (nvme_req(rq)->flags & NVME_REQ_CANCELLED)
- ret = -EINTR;
-
- rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64);
- rqd->error = nvme_req(rq)->status;
-
- blk_mq_free_request(rq);
-
+err_free_cmd:
+ kfree(cmd);
return ret;
}
-static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name)
+static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name,
+ int size)
{
struct nvme_ns *ns = nvmdev->q->queuedata;
- return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0);
+ return dma_pool_create(name, ns->ctrl->dev, size, PAGE_SIZE, 0);
}
static void nvme_nvm_destroy_dma_pool(void *pool)
@@ -757,7 +740,6 @@
.get_chk_meta = nvme_nvm_get_chk_meta,
.submit_io = nvme_nvm_submit_io,
- .submit_io_sync = nvme_nvm_submit_io_sync,
.create_dma_pool = nvme_nvm_create_dma_pool,
.destroy_dma_pool = nvme_nvm_destroy_dma_pool,
@@ -926,9 +908,9 @@
/* cdw11-12 */
c.ph_rw.length = cpu_to_le16(vcmd.nppas);
c.ph_rw.control = cpu_to_le16(vcmd.control);
- c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13);
- c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14);
- c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15);
+ c.common.cdw13 = cpu_to_le32(vcmd.cdw13);
+ c.common.cdw14 = cpu_to_le32(vcmd.cdw14);
+ c.common.cdw15 = cpu_to_le32(vcmd.cdw15);
if (vcmd.timeout_ms)
timeout = msecs_to_jiffies(vcmd.timeout_ms);
@@ -963,19 +945,11 @@
}
}
-void nvme_nvm_update_nvm_info(struct nvme_ns *ns)
-{
- struct nvm_dev *ndev = ns->ndev;
- struct nvm_geo *geo = &ndev->geo;
-
- geo->csecs = 1 << ns->lba_shift;
- geo->sos = ns->ms;
-}
-
int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
{
struct request_queue *q = ns->queue;
struct nvm_dev *dev;
+ struct nvm_geo *geo;
_nvme_nvm_check_size();
@@ -983,6 +957,13 @@
if (!dev)
return -ENOMEM;
+ /* Note that csecs and sos will be overridden if it is a 1.2 drive. */
+ geo = &dev->geo;
+ geo->csecs = 1 << ns->lba_shift;
+ geo->sos = ns->ms;
+ geo->ext = ns->ext;
+ geo->mdts = ns->ctrl->max_hw_sectors;
+
dev->q = q;
memcpy(dev->name, disk_name, DISK_NAME_LEN);
dev->ops = &nvme_nvm_dev_ops;
@@ -1190,42 +1171,6 @@
static NVM_DEV_ATTR_12_RO(media_capabilities);
static NVM_DEV_ATTR_12_RO(max_phys_secs);
-static struct attribute *nvm_dev_attrs_12[] = {
- &dev_attr_version.attr,
- &dev_attr_capabilities.attr,
-
- &dev_attr_vendor_opcode.attr,
- &dev_attr_device_mode.attr,
- &dev_attr_media_manager.attr,
- &dev_attr_ppa_format.attr,
- &dev_attr_media_type.attr,
- &dev_attr_flash_media_type.attr,
- &dev_attr_num_channels.attr,
- &dev_attr_num_luns.attr,
- &dev_attr_num_planes.attr,
- &dev_attr_num_blocks.attr,
- &dev_attr_num_pages.attr,
- &dev_attr_page_size.attr,
- &dev_attr_hw_sector_size.attr,
- &dev_attr_oob_sector_size.attr,
- &dev_attr_read_typ.attr,
- &dev_attr_read_max.attr,
- &dev_attr_prog_typ.attr,
- &dev_attr_prog_max.attr,
- &dev_attr_erase_typ.attr,
- &dev_attr_erase_max.attr,
- &dev_attr_multiplane_modes.attr,
- &dev_attr_media_capabilities.attr,
- &dev_attr_max_phys_secs.attr,
-
- NULL,
-};
-
-static const struct attribute_group nvm_dev_attr_group_12 = {
- .name = "lightnvm",
- .attrs = nvm_dev_attrs_12,
-};
-
/* 2.0 values */
static NVM_DEV_ATTR_20_RO(groups);
static NVM_DEV_ATTR_20_RO(punits);
@@ -1241,10 +1186,37 @@
static NVM_DEV_ATTR_20_RO(reset_typ);
static NVM_DEV_ATTR_20_RO(reset_max);
-static struct attribute *nvm_dev_attrs_20[] = {
+static struct attribute *nvm_dev_attrs[] = {
+ /* version agnostic attrs */
&dev_attr_version.attr,
&dev_attr_capabilities.attr,
+ &dev_attr_read_typ.attr,
+ &dev_attr_read_max.attr,
+ /* 1.2 attrs */
+ &dev_attr_vendor_opcode.attr,
+ &dev_attr_device_mode.attr,
+ &dev_attr_media_manager.attr,
+ &dev_attr_ppa_format.attr,
+ &dev_attr_media_type.attr,
+ &dev_attr_flash_media_type.attr,
+ &dev_attr_num_channels.attr,
+ &dev_attr_num_luns.attr,
+ &dev_attr_num_planes.attr,
+ &dev_attr_num_blocks.attr,
+ &dev_attr_num_pages.attr,
+ &dev_attr_page_size.attr,
+ &dev_attr_hw_sector_size.attr,
+ &dev_attr_oob_sector_size.attr,
+ &dev_attr_prog_typ.attr,
+ &dev_attr_prog_max.attr,
+ &dev_attr_erase_typ.attr,
+ &dev_attr_erase_max.attr,
+ &dev_attr_multiplane_modes.attr,
+ &dev_attr_media_capabilities.attr,
+ &dev_attr_max_phys_secs.attr,
+
+ /* 2.0 attrs */
&dev_attr_groups.attr,
&dev_attr_punits.attr,
&dev_attr_chunks.attr,
@@ -1255,8 +1227,6 @@
&dev_attr_maxocpu.attr,
&dev_attr_mw_cunits.attr,
- &dev_attr_read_typ.attr,
- &dev_attr_read_max.attr,
&dev_attr_write_typ.attr,
&dev_attr_write_max.attr,
&dev_attr_reset_typ.attr,
@@ -1265,44 +1235,38 @@
NULL,
};
-static const struct attribute_group nvm_dev_attr_group_20 = {
- .name = "lightnvm",
- .attrs = nvm_dev_attrs_20,
-};
-
-int nvme_nvm_register_sysfs(struct nvme_ns *ns)
+static umode_t nvm_dev_attrs_visible(struct kobject *kobj,
+ struct attribute *attr, int index)
{
+ struct device *dev = container_of(kobj, struct device, kobj);
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nvme_ns *ns = disk->private_data;
struct nvm_dev *ndev = ns->ndev;
- struct nvm_geo *geo = &ndev->geo;
+ struct device_attribute *dev_attr =
+ container_of(attr, typeof(*dev_attr), attr);
if (!ndev)
- return -EINVAL;
+ return 0;
- switch (geo->major_ver_id) {
+ if (dev_attr->show == nvm_dev_attr_show)
+ return attr->mode;
+
+ switch (ndev->geo.major_ver_id) {
case 1:
- return sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
- &nvm_dev_attr_group_12);
- case 2:
- return sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
- &nvm_dev_attr_group_20);
- }
-
- return -EINVAL;
-}
-
-void nvme_nvm_unregister_sysfs(struct nvme_ns *ns)
-{
- struct nvm_dev *ndev = ns->ndev;
- struct nvm_geo *geo = &ndev->geo;
-
- switch (geo->major_ver_id) {
- case 1:
- sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
- &nvm_dev_attr_group_12);
+ if (dev_attr->show == nvm_dev_attr_show_12)
+ return attr->mode;
break;
case 2:
- sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
- &nvm_dev_attr_group_20);
+ if (dev_attr->show == nvm_dev_attr_show_20)
+ return attr->mode;
break;
}
+
+ return 0;
}
+
+const struct attribute_group nvme_nvm_attr_group = {
+ .name = "lightnvm",
+ .attrs = nvm_dev_attrs,
+ .is_visible = nvm_dev_attrs_visible,
+};
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index c27af27..e0f064d 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2017-2018 Christoph Hellwig.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
*/
#include <linux/moduleparam.h>
@@ -20,9 +12,34 @@
MODULE_PARM_DESC(multipath,
"turn on native support for multiple controllers per subsystem");
-inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
+void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
{
- return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
+ struct nvme_ns_head *h;
+
+ lockdep_assert_held(&subsys->lock);
+ list_for_each_entry(h, &subsys->nsheads, entry)
+ if (h->disk)
+ blk_mq_unfreeze_queue(h->disk->queue);
+}
+
+void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
+{
+ struct nvme_ns_head *h;
+
+ lockdep_assert_held(&subsys->lock);
+ list_for_each_entry(h, &subsys->nsheads, entry)
+ if (h->disk)
+ blk_mq_freeze_queue_wait(h->disk->queue);
+}
+
+void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
+{
+ struct nvme_ns_head *h;
+
+ lockdep_assert_held(&subsys->lock);
+ list_for_each_entry(h, &subsys->nsheads, entry)
+ if (h->disk)
+ blk_freeze_queue_start(h->disk->queue);
}
/*
@@ -39,7 +56,7 @@
sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
} else if (ns->head->disk) {
sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
- ctrl->cntlid, ns->head->instance);
+ ctrl->instance, ns->head->instance);
*flags = GENHD_FL_HIDDEN;
} else {
sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance,
@@ -117,29 +134,125 @@
[NVME_ANA_CHANGE] = "change",
};
-static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
+bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
- struct nvme_ns *ns, *fallback = NULL;
+ struct nvme_ns_head *head = ns->head;
+ bool changed = false;
+ int node;
+
+ if (!head)
+ goto out;
+
+ for_each_node(node) {
+ if (ns == rcu_access_pointer(head->current_path[node])) {
+ rcu_assign_pointer(head->current_path[node], NULL);
+ changed = true;
+ }
+ }
+out:
+ return changed;
+}
+
+void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
+{
+ struct nvme_ns *ns;
+
+ mutex_lock(&ctrl->scan_lock);
+ down_read(&ctrl->namespaces_rwsem);
+ list_for_each_entry(ns, &ctrl->namespaces, list)
+ if (nvme_mpath_clear_current_path(ns))
+ kblockd_schedule_work(&ns->head->requeue_work);
+ up_read(&ctrl->namespaces_rwsem);
+ mutex_unlock(&ctrl->scan_lock);
+}
+
+static bool nvme_path_is_disabled(struct nvme_ns *ns)
+{
+ return ns->ctrl->state != NVME_CTRL_LIVE ||
+ test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
+ test_bit(NVME_NS_REMOVING, &ns->flags);
+}
+
+static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
+{
+ int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
+ struct nvme_ns *found = NULL, *fallback = NULL, *ns;
list_for_each_entry_rcu(ns, &head->list, siblings) {
- if (ns->ctrl->state != NVME_CTRL_LIVE ||
- test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+ if (nvme_path_is_disabled(ns))
continue;
+
+ if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+ distance = node_distance(node, ns->ctrl->numa_node);
+ else
+ distance = LOCAL_DISTANCE;
+
switch (ns->ana_state) {
case NVME_ANA_OPTIMIZED:
- rcu_assign_pointer(head->current_path, ns);
- return ns;
+ if (distance < found_distance) {
+ found_distance = distance;
+ found = ns;
+ }
+ break;
case NVME_ANA_NONOPTIMIZED:
- fallback = ns;
+ if (distance < fallback_distance) {
+ fallback_distance = distance;
+ fallback = ns;
+ }
break;
default:
break;
}
}
- if (fallback)
- rcu_assign_pointer(head->current_path, fallback);
- return fallback;
+ if (!found)
+ found = fallback;
+ if (found)
+ rcu_assign_pointer(head->current_path[node], found);
+ return found;
+}
+
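The NUMA policy above reduces to a lowest-distance scan: among usable paths, prefer the ANA-optimized path closest to the submitting node, falling back to the closest non-optimized one. A minimal standalone C sketch of that selection logic (hypothetical path/ana_state types and a stub node_distance(); an illustration of the idea, not the kernel code itself):

#include <limits.h>
#include <stddef.h>
#include <stdio.h>

enum ana_state { ANA_OPTIMIZED, ANA_NONOPTIMIZED, ANA_INACCESSIBLE };

struct path {
	const char *name;
	int numa_node;
	enum ana_state state;
	int disabled;	/* controller not live, ANA pending, or being removed */
};

/* stub for node_distance(): the local node is closest */
static int node_distance(int a, int b)
{
	return a == b ? 10 : 20;
}

/* pick the closest optimized path, else the closest non-optimized one */
static struct path *find_path(struct path *paths, int npaths, int node)
{
	int found_dist = INT_MAX, fallback_dist = INT_MAX;
	struct path *found = NULL, *fallback = NULL;
	int i;

	for (i = 0; i < npaths; i++) {
		struct path *p = &paths[i];
		int dist;

		if (p->disabled)
			continue;
		dist = node_distance(node, p->numa_node);

		if (p->state == ANA_OPTIMIZED && dist < found_dist) {
			found_dist = dist;
			found = p;
		} else if (p->state == ANA_NONOPTIMIZED && dist < fallback_dist) {
			fallback_dist = dist;
			fallback = p;
		}
	}
	return found ? found : fallback;
}

int main(void)
{
	struct path paths[] = {
		{ "nvme0c0n1", 0, ANA_NONOPTIMIZED, 0 },
		{ "nvme0c1n1", 1, ANA_OPTIMIZED, 0 },
	};
	struct path *p = find_path(paths, 2, 0);

	printf("I/O from node 0 goes via %s\n", p ? p->name : "none");
	return 0;
}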
+static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
+ struct nvme_ns *ns)
+{
+ ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
+ siblings);
+ if (ns)
+ return ns;
+ return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
+}
+
+static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
+ int node, struct nvme_ns *old)
+{
+ struct nvme_ns *ns, *found, *fallback = NULL;
+
+ if (list_is_singular(&head->list)) {
+ if (nvme_path_is_disabled(old))
+ return NULL;
+ return old;
+ }
+
+ for (ns = nvme_next_ns(head, old);
+ ns != old;
+ ns = nvme_next_ns(head, ns)) {
+ if (nvme_path_is_disabled(ns))
+ continue;
+
+ if (ns->ana_state == NVME_ANA_OPTIMIZED) {
+ found = ns;
+ goto out;
+ }
+ if (ns->ana_state == NVME_ANA_NONOPTIMIZED)
+ fallback = ns;
+ }
+
+ if (!fallback)
+ return NULL;
+ found = fallback;
+out:
+ rcu_assign_pointer(head->current_path[node], found);
+ return found;
}
static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
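The round-robin policy instead walks the sibling list circularly, starting just after the previously used path, taking the first optimized path it finds and remembering a usable non-optimized path as a fallback. A simplified standalone illustration of that traversal over a plain array (hypothetical types; the driver iterates an RCU-protected list instead):

#include <stddef.h>
#include <stdio.h>

enum ana_state { ANA_OPTIMIZED, ANA_NONOPTIMIZED, ANA_INACCESSIBLE };

struct path {
	const char *name;
	enum ana_state state;
	int disabled;
};

/* circular successor of index i in an array of n paths */
static int next_idx(int i, int n)
{
	return (i + 1) % n;
}

/*
 * Start at the entry after "old" and stop once we are back at "old":
 * return the first optimized path, otherwise a usable non-optimized
 * one, otherwise none (-1).
 */
static int round_robin(struct path *paths, int n, int old)
{
	int fallback = -1;
	int i;

	for (i = next_idx(old, n); i != old; i = next_idx(i, n)) {
		if (paths[i].disabled)
			continue;
		if (paths[i].state == ANA_OPTIMIZED)
			return i;
		if (paths[i].state == ANA_NONOPTIMIZED && fallback < 0)
			fallback = i;
	}
	return fallback;
}

int main(void)
{
	struct path paths[] = {
		{ "path0", ANA_OPTIMIZED, 0 },
		{ "path1", ANA_OPTIMIZED, 0 },
		{ "path2", ANA_NONOPTIMIZED, 0 },
	};
	int cur = 0;
	int i;

	/* successive selections alternate between the two optimized paths */
	for (i = 0; i < 4; i++) {
		cur = round_robin(paths, 3, cur);
		if (cur < 0)
			break;
		printf("I/O %d -> %s\n", i, paths[cur].name);
	}
	return 0;
}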
@@ -150,13 +263,35 @@
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
- struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
+ int node = numa_node_id();
+ struct nvme_ns *ns;
+ ns = srcu_dereference(head->current_path[node], &head->srcu);
+ if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
+ ns = nvme_round_robin_path(head, node, ns);
if (unlikely(!ns || !nvme_path_is_optimized(ns)))
- ns = __nvme_find_path(head);
+ ns = __nvme_find_path(head, node);
return ns;
}
+static bool nvme_available_path(struct nvme_ns_head *head)
+{
+ struct nvme_ns *ns;
+
+ list_for_each_entry_rcu(ns, &head->list, siblings) {
+ switch (ns->ctrl->state) {
+ case NVME_CTRL_LIVE:
+ case NVME_CTRL_RESETTING:
+ case NVME_CTRL_CONNECTING:
+ /* fallthru */
+ return true;
+ default:
+ break;
+ }
+ }
+ return false;
+}
+
static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
struct bio *bio)
{
@@ -166,6 +301,14 @@
blk_qc_t ret = BLK_QC_T_NONE;
int srcu_idx;
+ /*
+ * The namespace might be going away and the bio might
+ * be moved to a different queue via blk_steal_bios(),
+ * so we need to use the bio_split pool from the original
+ * queue to allocate the bvecs from.
+ */
+ blk_queue_split(q, &bio);
+
srcu_idx = srcu_read_lock(&head->srcu);
ns = nvme_find_path(head);
if (likely(ns)) {
@@ -175,14 +318,14 @@
disk_devt(ns->head->disk),
bio->bi_iter.bi_sector);
ret = direct_make_request(bio);
- } else if (!list_empty_careful(&head->list)) {
- dev_warn_ratelimited(dev, "no path available - requeuing I/O\n");
+ } else if (nvme_available_path(head)) {
+ dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
spin_lock_irq(&head->requeue_lock);
bio_list_add(&head->requeue_list, bio);
spin_unlock_irq(&head->requeue_lock);
} else {
- dev_warn_ratelimited(dev, "no path - failing I/O\n");
+ dev_warn_ratelimited(dev, "no available path - failing I/O\n");
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
@@ -192,21 +335,6 @@
return ret;
}
-static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
-{
- struct nvme_ns_head *head = q->queuedata;
- struct nvme_ns *ns;
- bool found = false;
- int srcu_idx;
-
- srcu_idx = srcu_read_lock(&head->srcu);
- ns = srcu_dereference(head->current_path, &head->srcu);
- if (likely(ns && nvme_path_is_optimized(ns)))
- found = ns->queue->poll_fn(q, qc);
- srcu_read_unlock(&head->srcu, srcu_idx);
- return found;
-}
-
static void nvme_requeue_work(struct work_struct *work)
{
struct nvme_ns_head *head =
@@ -248,12 +376,11 @@
if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
return 0;
- q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
+ q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node);
if (!q)
goto out;
q->queuedata = head;
blk_queue_make_request(q, nvme_ns_head_make_request);
- q->poll_fn = nvme_ns_head_poll;
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
/* set to a default value for 512 until disk is validated */
blk_queue_logical_block_size(q, 512);
@@ -290,14 +417,20 @@
if (!head->disk)
return;
- if (!(head->disk->flags & GENHD_FL_UP)) {
- device_add_disk(&head->subsys->dev, head->disk);
- if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
- &nvme_ns_id_attr_group))
- dev_warn(&head->subsys->dev,
- "failed to create id group.\n");
+ if (!(head->disk->flags & GENHD_FL_UP))
+ device_add_disk(&head->subsys->dev, head->disk,
+ nvme_ns_id_attr_groups);
+
+ if (nvme_path_is_optimized(ns)) {
+ int node, srcu_idx;
+
+ srcu_idx = srcu_read_lock(&head->srcu);
+ for_each_node(node)
+ __nvme_find_path(head, node);
+ srcu_read_unlock(&head->srcu, srcu_idx);
}
+ synchronize_srcu(&ns->head->srcu);
kblockd_schedule_work(&ns->head->requeue_work);
}
@@ -349,15 +482,12 @@
static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
struct nvme_ns *ns)
{
- enum nvme_ana_state old;
-
mutex_lock(&ns->head->lock);
- old = ns->ana_state;
ns->ana_grpid = le32_to_cpu(desc->grpid);
ns->ana_state = desc->state;
clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
- if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old))
+ if (nvme_state_is_live(ns->ana_state))
nvme_mpath_set_live(ns);
mutex_unlock(&ns->head->lock);
}
@@ -369,7 +499,7 @@
unsigned *nr_change_groups = data;
struct nvme_ns *ns;
- dev_info(ctrl->device, "ANA group %d: %s.\n",
+ dev_dbg(ctrl->device, "ANA group %d: %s.\n",
le32_to_cpu(desc->grpid),
nvme_ana_state_names[desc->state]);
@@ -381,25 +511,26 @@
down_write(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list) {
- if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
+ unsigned nsid = le32_to_cpu(desc->nsids[n]);
+
+ if (ns->head->ns_id < nsid)
continue;
- nvme_update_ns_ana_state(desc, ns);
+ if (ns->head->ns_id == nsid)
+ nvme_update_ns_ana_state(desc, ns);
if (++n == nr_nsids)
break;
}
up_write(&ctrl->namespaces_rwsem);
- WARN_ON_ONCE(n < nr_nsids);
return 0;
}
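The reworked loop relies on both the controller's namespace list and the descriptor's nsids array being sorted in ascending order, so a single forward scan can skip namespaces below the current nsid and update only on exact matches, instead of requiring every listed nsid to have a namespace. A standalone sketch of that merge-style scan over two sorted arrays (hypothetical example values):

#include <stdio.h>

int main(void)
{
	unsigned int ns_ids[] = { 1, 2, 4, 5, 7 };	/* controller namespaces, sorted */
	unsigned int nsids[] = { 2, 3, 5 };		/* nsids of one ANA group descriptor, sorted */
	unsigned int nr_nsids = sizeof(nsids) / sizeof(nsids[0]);
	unsigned int n = 0, i;

	for (i = 0; i < sizeof(ns_ids) / sizeof(ns_ids[0]) && n < nr_nsids; i++) {
		if (ns_ids[i] < nsids[n])
			continue;	/* namespace not covered by this descriptor entry yet */
		if (ns_ids[i] == nsids[n])
			printf("update ANA state of nsid %u\n", ns_ids[i]);
		n++;			/* advance to the next nsid, even if it had no namespace */
	}
	return 0;
}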
-static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
+static int nvme_read_ana_log(struct nvme_ctrl *ctrl)
{
u32 nr_change_groups = 0;
int error;
mutex_lock(&ctrl->ana_lock);
- error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA,
- groups_only ? NVME_ANA_LOG_RGO : 0,
+ error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0,
ctrl->ana_log_buf, ctrl->ana_log_size, 0);
if (error) {
dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
@@ -435,7 +566,7 @@
{
struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
- nvme_read_ana_log(ctrl, false);
+ nvme_read_ana_log(ctrl);
}
static void nvme_anatt_timeout(struct timer_list *t)
@@ -454,6 +585,44 @@
cancel_work_sync(&ctrl->ana_work);
}
+#define SUBSYS_ATTR_RW(_name, _mode, _show, _store) \
+ struct device_attribute subsys_attr_##_name = \
+ __ATTR(_name, _mode, _show, _store)
+
+static const char *nvme_iopolicy_names[] = {
+ [NVME_IOPOLICY_NUMA] = "numa",
+ [NVME_IOPOLICY_RR] = "round-robin",
+};
+
+static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_subsystem *subsys =
+ container_of(dev, struct nvme_subsystem, dev);
+
+ return sprintf(buf, "%s\n",
+ nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
+}
+
+static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct nvme_subsystem *subsys =
+ container_of(dev, struct nvme_subsystem, dev);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
+ if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
+ WRITE_ONCE(subsys->iopolicy, i);
+ return count;
+ }
+ }
+
+ return -EINVAL;
+}
+SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
+ nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
+
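The iopolicy attribute simply maps the written string onto an index into nvme_iopolicy_names and publishes it with WRITE_ONCE(), so nvme_find_path() can read it locklessly via READ_ONCE(). A tiny standalone sketch of the same table lookup (plain strcmp rather than sysfs_streq, which additionally tolerates a trailing newline):

#include <stdio.h>
#include <string.h>

static const char *iopolicy_names[] = { "numa", "round-robin" };

/* return the index of the matching policy name, or -1 if unknown */
static int parse_iopolicy(const char *buf)
{
	size_t i;

	for (i = 0; i < sizeof(iopolicy_names) / sizeof(iopolicy_names[0]); i++)
		if (!strcmp(buf, iopolicy_names[i]))
			return (int)i;
	return -1;
}

int main(void)
{
	printf("%d\n", parse_iopolicy("round-robin"));	/* 1 */
	printf("%d\n", parse_iopolicy("banana"));	/* -1 */
	return 0;
}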
static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -502,11 +671,8 @@
{
if (!head->disk)
return;
- if (head->disk->flags & GENHD_FL_UP) {
- sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
- &nvme_ns_id_attr_group);
+ if (head->disk->flags & GENHD_FL_UP)
del_gendisk(head->disk);
- }
blk_set_queue_dying(head->disk->queue);
/* make sure all pending bios are cleaned up */
kblockd_schedule_work(&head->requeue_work);
@@ -519,7 +685,8 @@
{
int error;
- if (!nvme_ctrl_use_ana(ctrl))
+ /* check if multipath is enabled and we have the capability */
+ if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3)))
return 0;
ctrl->anacap = id->anacap;
@@ -531,8 +698,7 @@
timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
- if (!(ctrl->anacap & (1 << 6)))
- ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
+ ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
dev_err(ctrl->device,
@@ -550,12 +716,13 @@
goto out;
}
- error = nvme_read_ana_log(ctrl, true);
+ error = nvme_read_ana_log(ctrl);
if (error)
goto out_free_ana_log_buf;
return 0;
out_free_ana_log_buf:
kfree(ctrl->ana_log_buf);
+ ctrl->ana_log_buf = NULL;
out:
return error;
}
@@ -563,5 +730,6 @@
void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
kfree(ctrl->ana_log_buf);
+ ctrl->ana_log_buf = NULL;
}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 60220de..22e8401 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -1,14 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2011-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
*/
#ifndef _NVME_H
@@ -23,6 +15,9 @@
#include <linux/sed-opal.h>
#include <linux/fault-inject.h>
#include <linux/rcupdate.h>
+#include <linux/wait.h>
+
+#include <trace/events/block.h>
extern unsigned int nvme_io_timeout;
#define NVME_IO_TIMEOUT (nvme_io_timeout * HZ)
@@ -90,6 +85,36 @@
* Set MEDIUM priority on SQ creation
*/
NVME_QUIRK_MEDIUM_PRIO_SQ = (1 << 7),
+
+ /*
+ * Ignore device provided subnqn.
+ */
+ NVME_QUIRK_IGNORE_DEV_SUBNQN = (1 << 8),
+
+ /*
+ * Broken Write Zeroes.
+ */
+ NVME_QUIRK_DISABLE_WRITE_ZEROES = (1 << 9),
+
+ /*
+ * Force simple suspend/resume path.
+ */
+ NVME_QUIRK_SIMPLE_SUSPEND = (1 << 10),
+
+ /*
+ * Use only one interrupt vector for all queues
+ */
+ NVME_QUIRK_SINGLE_VECTOR = (1 << 11),
+
+ /*
+ * Use non-standard 128-byte SQEs.

+ */
+ NVME_QUIRK_128_BYTES_SQES = (1 << 12),
+
+ /*
+ * Prevent tag overlap between queues
+ */
+ NVME_QUIRK_SHARED_TAGS = (1 << 13),
};
/*
@@ -137,22 +162,34 @@
enum nvme_ctrl_state {
NVME_CTRL_NEW,
NVME_CTRL_LIVE,
- NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */
NVME_CTRL_RESETTING,
NVME_CTRL_CONNECTING,
NVME_CTRL_DELETING,
NVME_CTRL_DEAD,
};
+struct nvme_fault_inject {
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+ struct fault_attr attr;
+ struct dentry *parent;
+ bool dont_retry; /* DNR, do not retry */
+ u16 status; /* status code */
+#endif
+};
+
struct nvme_ctrl {
+ bool comp_seen;
enum nvme_ctrl_state state;
bool identified;
spinlock_t lock;
+ struct mutex scan_lock;
const struct nvme_ctrl_ops *ops;
struct request_queue *admin_q;
struct request_queue *connect_q;
+ struct request_queue *fabrics_q;
struct device *dev;
int instance;
+ int numa_node;
struct blk_mq_tag_set *tagset;
struct blk_mq_tag_set *admin_tagset;
struct list_head namespaces;
@@ -162,6 +199,7 @@
struct cdev cdev;
struct work_struct reset_work;
struct work_struct delete_work;
+ wait_queue_head_t state_wq;
struct nvme_subsystem *subsys;
struct list_head subsys_entry;
@@ -179,10 +217,12 @@
u32 page_size;
u32 max_hw_sectors;
u32 max_segments;
+ u16 crdt[3];
u16 oncs;
u16 oacs;
u16 nssa;
u16 nr_streams;
+ u16 sqsize;
u32 max_namespaces;
atomic_t abort_limit;
u8 vwc;
@@ -193,6 +233,7 @@
u8 apsta;
u32 oaes;
u32 aen_result;
+ u32 ctratt;
unsigned int shutdown_timeout;
unsigned int kato;
bool subsystem;
@@ -230,13 +271,22 @@
u16 hmmaxd;
/* Fabrics only */
- u16 sqsize;
u32 ioccsz;
u32 iorcsz;
u16 icdoff;
u16 maxcmd;
int nr_reconnects;
struct nvmf_ctrl_options *opts;
+
+ struct page *discard_page;
+ unsigned long discard_page_busy;
+
+ struct nvme_fault_inject fault_inject;
+};
+
+enum nvme_iopolicy {
+ NVME_IOPOLICY_NUMA,
+ NVME_IOPOLICY_RR,
};
struct nvme_subsystem {
@@ -257,7 +307,11 @@
char firmware_rev[8];
u8 cmic;
u16 vendor_id;
+ u16 awupf; /* 0's based awupf value. */
struct ida ns_ida;
+#ifdef CONFIG_NVME_MULTIPATH
+ enum nvme_iopolicy iopolicy;
+#endif
};
/*
@@ -277,14 +331,6 @@
* only ever has a single entry for private namespaces.
*/
struct nvme_ns_head {
-#ifdef CONFIG_NVME_MULTIPATH
- struct gendisk *disk;
- struct nvme_ns __rcu *current_path;
- struct bio_list requeue_list;
- spinlock_t requeue_lock;
- struct work_struct requeue_work;
- struct mutex lock;
-#endif
struct list_head list;
struct srcu_struct srcu;
struct nvme_subsystem *subsys;
@@ -293,16 +339,15 @@
struct list_head entry;
struct kref ref;
int instance;
-};
-
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-struct nvme_fault_inject {
- struct fault_attr attr;
- struct dentry *parent;
- bool dont_retry; /* DNR, do not retry */
- u16 status; /* status code */
-};
+#ifdef CONFIG_NVME_MULTIPATH
+ struct gendisk *disk;
+ struct bio_list requeue_list;
+ spinlock_t requeue_lock;
+ struct work_struct requeue_work;
+ struct mutex lock;
+ struct nvme_ns __rcu *current_path[];
#endif
+};
struct nvme_ns {
struct list_head list;
@@ -331,9 +376,7 @@
#define NVME_NS_ANA_PENDING 2
u16 noiob;
-#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
struct nvme_fault_inject fault_inject;
-#endif
};
@@ -343,6 +386,7 @@
unsigned int flags;
#define NVME_F_FABRICS (1 << 0)
#define NVME_F_METADATA_SUPPORTED (1 << 1)
+#define NVME_F_PCI_P2PDMA (1 << 2)
int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
@@ -350,28 +394,24 @@
void (*submit_async_event)(struct nvme_ctrl *ctrl);
void (*delete_ctrl)(struct nvme_ctrl *ctrl);
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
- void (*stop_ctrl)(struct nvme_ctrl *ctrl);
};
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
-void nvme_fault_inject_init(struct nvme_ns *ns);
-void nvme_fault_inject_fini(struct nvme_ns *ns);
+void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj,
+ const char *dev_name);
+void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inject);
void nvme_should_fail(struct request *req);
#else
-static inline void nvme_fault_inject_init(struct nvme_ns *ns) {}
-static inline void nvme_fault_inject_fini(struct nvme_ns *ns) {}
+static inline void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj,
+ const char *dev_name)
+{
+}
+static inline void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inj)
+{
+}
static inline void nvme_should_fail(struct request *req) {}
#endif
-static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl)
-{
- u32 val = 0;
-
- if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val))
- return false;
- return val & NVME_CSTS_RDY;
-}
-
static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
{
if (!ctrl->subsystem)
@@ -407,11 +447,12 @@
}
void nvme_complete_rq(struct request *req);
-void nvme_cancel_request(struct request *req, void *data, bool reserved);
+bool nvme_cancel_request(struct request *req, void *data, bool reserved);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
-int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
-int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
+bool nvme_wait_reset(struct nvme_ctrl *ctrl);
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
+int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
const struct nvme_ctrl_ops *ops, unsigned long quirks);
@@ -432,6 +473,7 @@
void nvme_stop_queues(struct nvme_ctrl *ctrl);
void nvme_start_queues(struct nvme_ctrl *ctrl);
void nvme_kill_queues(struct nvme_ctrl *ctrl);
+void nvme_sync_queues(struct nvme_ctrl *ctrl);
void nvme_unfreeze(struct nvme_ctrl *ctrl);
void nvme_wait_freeze(struct nvme_ctrl *ctrl);
void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
@@ -448,22 +490,35 @@
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
unsigned timeout, int qid, int at_head,
- blk_mq_req_flags_t flags);
+ blk_mq_req_flags_t flags, bool poll);
+int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid,
+ unsigned int dword11, void *buffer, size_t buflen,
+ u32 *result);
+int nvme_get_features(struct nvme_ctrl *dev, unsigned int fid,
+ unsigned int dword11, void *buffer, size_t buflen,
+ u32 *result);
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
+int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
-int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
void *log, size_t size, u64 offset);
-extern const struct attribute_group nvme_ns_id_attr_group;
+extern const struct attribute_group *nvme_ns_id_attr_groups[];
extern const struct block_device_operations nvme_ns_head_ops;
#ifdef CONFIG_NVME_MULTIPATH
-bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
+static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
+{
+ return ctrl->ana_log_buf != NULL;
+}
+
+void nvme_mpath_unfreeze(struct nvme_subsystem *subsys);
+void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
+void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
struct nvme_ctrl *ctrl, int *flags);
void nvme_failover_req(struct request *req);
@@ -474,14 +529,8 @@
int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
void nvme_mpath_stop(struct nvme_ctrl *ctrl);
-
-static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
-{
- struct nvme_ns_head *head = ns->head;
-
- if (head && ns == rcu_access_pointer(head->current_path))
- rcu_assign_pointer(head->current_path, NULL);
-}
+bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
+void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
@@ -492,8 +541,19 @@
kblockd_schedule_work(&head->requeue_work);
}
+static inline void nvme_trace_bio_complete(struct request *req,
+ blk_status_t status)
+{
+ struct nvme_ns *ns = req->q->queuedata;
+
+ if (req->cmd_flags & REQ_NVME_MPATH)
+ trace_block_bio_complete(ns->head->disk->queue,
+ req->bio, status);
+}
+
extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
+extern struct device_attribute subsys_attr_iopolicy;
#else
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
@@ -528,12 +588,20 @@
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
}
-static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
+static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
+{
+ return false;
+}
+static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
{
}
static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
{
}
+static inline void nvme_trace_bio_complete(struct request *req,
+ blk_status_t status)
+{
+}
static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
struct nvme_id_ctrl *id)
{
@@ -548,17 +616,23 @@
static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
}
+static inline void nvme_mpath_unfreeze(struct nvme_subsystem *subsys)
+{
+}
+static inline void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
+{
+}
+static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
+{
+}
#endif /* CONFIG_NVME_MULTIPATH */
#ifdef CONFIG_NVM
-void nvme_nvm_update_nvm_info(struct nvme_ns *ns);
int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
void nvme_nvm_unregister(struct nvme_ns *ns);
-int nvme_nvm_register_sysfs(struct nvme_ns *ns);
-void nvme_nvm_unregister_sysfs(struct nvme_ns *ns);
+extern const struct attribute_group nvme_nvm_attr_group;
int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg);
#else
-static inline void nvme_nvm_update_nvm_info(struct nvme_ns *ns) {};
static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name,
int node)
{
@@ -566,11 +640,6 @@
}
static inline void nvme_nvm_unregister(struct nvme_ns *ns) {};
-static inline int nvme_nvm_register_sysfs(struct nvme_ns *ns)
-{
- return 0;
-}
-static inline void nvme_nvm_unregister_sysfs(struct nvme_ns *ns) {};
static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd,
unsigned long arg)
{
@@ -583,7 +652,4 @@
return dev_to_disk(dev)->private_data;
}
-int __init nvme_core_init(void);
-void nvme_core_exit(void);
-
#endif /* _NVME_H */
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d668682..869f462 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* NVM Express device driver
* Copyright (c) 2011-2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
*/
#include <linux/aer.h>
@@ -26,15 +18,18 @@
#include <linux/mutex.h>
#include <linux/once.h>
#include <linux/pci.h>
+#include <linux/suspend.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/sed-opal.h>
+#include <linux/pci-p2pdma.h>
+#include "trace.h"
#include "nvme.h"
-#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
-#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
+#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes)
+#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
@@ -73,10 +68,21 @@
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
+static int write_queues;
+module_param(write_queues, int, 0644);
+MODULE_PARM_DESC(write_queues,
+ "Number of queues to use for writes. If not set, reads and writes "
+ "will share a queue set.");
+
+static int poll_queues;
+module_param(poll_queues, int, 0644);
+MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
+
struct nvme_dev;
struct nvme_queue;
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
+static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
@@ -91,21 +97,22 @@
struct dma_pool *prp_small_pool;
unsigned online_queues;
unsigned max_qid;
+ unsigned io_queues[HCTX_MAX_TYPES];
unsigned int num_vecs;
int q_depth;
+ int io_sqes;
u32 db_stride;
void __iomem *bar;
unsigned long bar_mapped_size;
struct work_struct remove_work;
struct mutex shutdown_lock;
bool subsystem;
- void __iomem *cmb;
- pci_bus_addr_t cmb_bus_addr;
u64 cmb_size;
+ bool cmb_use_sqes;
u32 cmbsz;
u32 cmbloc;
struct nvme_ctrl ctrl;
- struct completion ioq_wait;
+ u32 last_ps;
mempool_t *iod_mempool;
@@ -154,35 +161,42 @@
* commands and one for I/O commands).
*/
struct nvme_queue {
- struct device *q_dmadev;
struct nvme_dev *dev;
spinlock_t sq_lock;
- struct nvme_command *sq_cmds;
- struct nvme_command __iomem *sq_cmds_io;
- spinlock_t cq_lock ____cacheline_aligned_in_smp;
+ void *sq_cmds;
+ /* only used for poll queues: */
+ spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
volatile struct nvme_completion *cqes;
struct blk_mq_tags **tags;
dma_addr_t sq_dma_addr;
dma_addr_t cq_dma_addr;
u32 __iomem *q_db;
u16 q_depth;
- s16 cq_vector;
+ u16 cq_vector;
u16 sq_tail;
+ u16 last_sq_tail;
u16 cq_head;
u16 last_cq_head;
u16 qid;
u8 cq_phase;
+ u8 sqes;
+ unsigned long flags;
+#define NVMEQ_ENABLED 0
+#define NVMEQ_SQ_CMB 1
+#define NVMEQ_DELETE_ERROR 2
+#define NVMEQ_POLLED 3
u32 *dbbuf_sq_db;
u32 *dbbuf_cq_db;
u32 *dbbuf_sq_ei;
u32 *dbbuf_cq_ei;
+ struct completion delete_done;
};
/*
- * The nvme_iod describes the data in an I/O, including the list of PRP
- * entries. You can't see it in this data structure because C doesn't let
- * me express that. Use nvme_init_iod to ensure there's enough space
- * allocated to store the PRP list.
+ * The nvme_iod describes the data in an I/O.
+ *
+ * The sg pointer contains the list of PRP/SGL chunk allocations in addition
+ * to the actual struct scatterlist.
*/
struct nvme_iod {
struct nvme_request req;
@@ -191,36 +205,26 @@
int aborted;
int npages; /* In the PRP list. 0 means small pool in use */
int nents; /* Used in scatterlist */
- int length; /* Of data, in bytes */
dma_addr_t first_dma;
- struct scatterlist meta_sg; /* metadata requires single contiguous buffer */
+ unsigned int dma_len; /* length of single DMA segment mapping */
+ dma_addr_t meta_dma;
struct scatterlist *sg;
- struct scatterlist inline_sg[0];
};
-/*
- * Check we didin't inadvertently grow the command struct
- */
-static inline void _nvme_check_size(void)
+static unsigned int max_io_queues(void)
{
- BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
- BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
- BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
- BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
+ return num_possible_cpus() + write_queues + poll_queues;
+}
+
+static unsigned int max_queue_count(void)
+{
+ /* IO queues + admin queue */
+ return 1 + max_io_queues();
}
static inline unsigned int nvme_dbbuf_size(u32 stride)
{
- return ((num_possible_cpus() + 1) * 8 * stride);
+ return (max_queue_count() * 8 * stride);
}
static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
@@ -332,12 +336,6 @@
}
/*
- * Max size of iod being embedded in the request payload
- */
-#define NVME_INT_PAGES 2
-#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->ctrl.page_size)
-
-/*
* Will slightly overestimate the number of pages needed. This is OK
* as it only leads to a small amount of wasted memory for the lifetime of
* the I/O.
@@ -371,15 +369,6 @@
return alloc_size + sizeof(struct scatterlist) * nseg;
}
-static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl)
-{
- unsigned int alloc_size = nvme_pci_iod_alloc_size(dev,
- NVME_INT_BYTES(dev), NVME_INT_PAGES,
- use_sgl);
-
- return sizeof(struct nvme_iod) + alloc_size;
-}
-
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
@@ -431,33 +420,91 @@
return 0;
}
+static int queue_irq_offset(struct nvme_dev *dev)
+{
+ /* if we have more than 1 vec, admin queue offsets us by 1 */
+ if (dev->num_vecs > 1)
+ return 1;
+
+ return 0;
+}
+
static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
struct nvme_dev *dev = set->driver_data;
+ int i, qoff, offset;
- return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev),
- dev->num_vecs > 1 ? 1 /* admin queue */ : 0);
+ offset = queue_irq_offset(dev);
+ for (i = 0, qoff = 0; i < set->nr_maps; i++) {
+ struct blk_mq_queue_map *map = &set->map[i];
+
+ map->nr_queues = dev->io_queues[i];
+ if (!map->nr_queues) {
+ BUG_ON(i == HCTX_TYPE_DEFAULT);
+ continue;
+ }
+
+ /*
+ * The poll queues do not have an IRQ (and hence IRQ
+ * affinity), so use the regular blk-mq cpu mapping
+ */
+ map->queue_offset = qoff;
+ if (i != HCTX_TYPE_POLL && offset)
+ blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
+ else
+ blk_mq_map_queues(map);
+ qoff += map->nr_queues;
+ offset += map->nr_queues;
+ }
+
+ return 0;
+}
+
+/*
+ * Write sq tail if we are asked to, or if the next command would wrap.
+ */
+static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
+{
+ if (!write_sq) {
+ u16 next_tail = nvmeq->sq_tail + 1;
+
+ if (next_tail == nvmeq->q_depth)
+ next_tail = 0;
+ if (next_tail != nvmeq->last_sq_tail)
+ return;
+ }
+
+ if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
+ nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
+ writel(nvmeq->sq_tail, nvmeq->q_db);
+ nvmeq->last_sq_tail = nvmeq->sq_tail;
}
/**
* nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
* @nvmeq: The queue to use
* @cmd: The command to send
+ * @write_sq: whether to write to the SQ doorbell
*/
-static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
+static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
+ bool write_sq)
{
spin_lock(&nvmeq->sq_lock);
- if (nvmeq->sq_cmds_io)
- memcpy_toio(&nvmeq->sq_cmds_io[nvmeq->sq_tail], cmd,
- sizeof(*cmd));
- else
- memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
-
+ memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
+ cmd, sizeof(*cmd));
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
- if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
- nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
- writel(nvmeq->sq_tail, nvmeq->q_db);
+ nvme_write_sq_db(nvmeq, write_sq);
+ spin_unlock(&nvmeq->sq_lock);
+}
+
+static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
+{
+ struct nvme_queue *nvmeq = hctx->driver_data;
+
+ spin_lock(&nvmeq->sq_lock);
+ if (nvmeq->sq_tail != nvmeq->last_sq_tail)
+ nvme_write_sq_db(nvmeq, true);
spin_unlock(&nvmeq->sq_lock);
}
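Together, nvme_write_sq_db() and nvme_commit_rqs() implement submission-queue doorbell batching: the doorbell is only rung for the last request of a blk-mq batch, or early if deferring any further would run the tail into the last value written. A standalone sketch of that decision (hypothetical queue struct; a counter stands in for the MMIO doorbell write):

#include <stdbool.h>
#include <stdio.h>

struct queue {
	unsigned int q_depth;
	unsigned int sq_tail;		/* next free SQ slot */
	unsigned int last_sq_tail;	/* tail value last written to the doorbell */
	unsigned int doorbell_writes;	/* counts doorbell updates */
};

/* ring the doorbell when asked, or when deferring would reach the last written tail */
static void write_sq_db(struct queue *q, bool write_sq)
{
	if (!write_sq) {
		unsigned int next_tail = (q->sq_tail + 1) % q->q_depth;

		if (next_tail != q->last_sq_tail)
			return;
	}
	q->doorbell_writes++;		/* stands in for writel(sq_tail, q_db) */
	q->last_sq_tail = q->sq_tail;
}

static void submit_cmd(struct queue *q, bool last_in_batch)
{
	q->sq_tail = (q->sq_tail + 1) % q->q_depth;
	write_sq_db(q, last_in_batch);
}

int main(void)
{
	struct queue q = { .q_depth = 16 };
	int i;

	/* a batch of four commands: only the last one rings the doorbell */
	for (i = 0; i < 4; i++)
		submit_cmd(&q, i == 3);
	printf("doorbell writes: %u\n", q.doorbell_writes);	/* 1 */
	return 0;
}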
@@ -487,38 +534,28 @@
return true;
}
-static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
- int nseg = blk_rq_nr_phys_segments(rq);
- unsigned int size = blk_rq_payload_bytes(rq);
-
- iod->use_sgl = nvme_pci_use_sgls(dev, rq);
-
- if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
- iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
- if (!iod->sg)
- return BLK_STS_RESOURCE;
- } else {
- iod->sg = iod->inline_sg;
- }
-
- iod->aborted = 0;
- iod->npages = -1;
- iod->nents = 0;
- iod->length = size;
-
- return BLK_STS_OK;
-}
-
-static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
+static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
-
int i;
+ if (iod->dma_len) {
+ dma_unmap_page(dev->dev, dma_addr, iod->dma_len,
+ rq_dma_dir(req));
+ return;
+ }
+
+ WARN_ON_ONCE(!iod->nents);
+
+ if (is_pci_p2pdma_page(sg_page(iod->sg)))
+ pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
+ rq_dma_dir(req));
+ else
+ dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
+
+
dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
dma_addr);
@@ -541,8 +578,7 @@
dma_addr = next_dma_addr;
}
- if (iod->sg != iod->inline_sg)
- mempool_free(iod->sg, dev->iod_mempool);
+ mempool_free(iod->sg, dev->iod_mempool);
}
static void nvme_print_sgl(struct scatterlist *sgl, int nents)
@@ -732,72 +768,105 @@
return BLK_STS_OK;
}
+static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
+ struct request *req, struct nvme_rw_command *cmnd,
+ struct bio_vec *bv)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1);
+ unsigned int first_prp_len = dev->ctrl.page_size - offset;
+
+ iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
+ if (dma_mapping_error(dev->dev, iod->first_dma))
+ return BLK_STS_RESOURCE;
+ iod->dma_len = bv->bv_len;
+
+ cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
+ if (bv->bv_len > first_prp_len)
+ cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
+ return 0;
+}
+
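nvme_setup_prp_simple() handles the common single-segment case: PRP1 points at the mapped address, and PRP2 is only filled in when the segment crosses a controller page boundary (the caller guarantees bv_offset + bv_len fits within two controller pages). A small worked example of that offset arithmetic, assuming a hypothetical 4 KiB controller page size:

#include <stdint.h>
#include <stdio.h>

#define CTRL_PAGE_SIZE 4096u	/* assumed controller page size */

int main(void)
{
	uint64_t dma_addr = 0x12340a00;		/* example mapped segment address */
	unsigned int bv_len = 5000;		/* example segment length, fits in two pages */
	unsigned int offset = dma_addr & (CTRL_PAGE_SIZE - 1);
	unsigned int first_prp_len = CTRL_PAGE_SIZE - offset;

	printf("prp1 = 0x%llx\n", (unsigned long long)dma_addr);
	if (bv_len > first_prp_len)
		printf("prp2 = 0x%llx (remaining %u bytes)\n",
		       (unsigned long long)(dma_addr + first_prp_len),
		       bv_len - first_prp_len);
	else
		printf("prp2 not needed\n");
	return 0;
}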
+static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
+ struct request *req, struct nvme_rw_command *cmnd,
+ struct bio_vec *bv)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
+ iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
+ if (dma_mapping_error(dev->dev, iod->first_dma))
+ return BLK_STS_RESOURCE;
+ iod->dma_len = bv->bv_len;
+
+ cmnd->flags = NVME_CMD_SGL_METABUF;
+ cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
+ cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
+ cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
+ return 0;
+}
+
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct request_queue *q = req->q;
- enum dma_data_direction dma_dir = rq_data_dir(req) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE;
- blk_status_t ret = BLK_STS_IOERR;
+ blk_status_t ret = BLK_STS_RESOURCE;
int nr_mapped;
+ if (blk_rq_nr_phys_segments(req) == 1) {
+ struct bio_vec bv = req_bvec(req);
+
+ if (!is_pci_p2pdma_page(bv.bv_page)) {
+ if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2)
+ return nvme_setup_prp_simple(dev, req,
+ &cmnd->rw, &bv);
+
+ if (iod->nvmeq->qid &&
+ dev->ctrl.sgls & ((1 << 0) | (1 << 1)))
+ return nvme_setup_sgl_simple(dev, req,
+ &cmnd->rw, &bv);
+ }
+ }
+
+ iod->dma_len = 0;
+ iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
+ if (!iod->sg)
+ return BLK_STS_RESOURCE;
sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
- iod->nents = blk_rq_map_sg(q, req, iod->sg);
+ iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
if (!iod->nents)
goto out;
- ret = BLK_STS_RESOURCE;
- nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir,
- DMA_ATTR_NO_WARN);
+ if (is_pci_p2pdma_page(sg_page(iod->sg)))
+ nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
+ iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN);
+ else
+ nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
+ rq_dma_dir(req), DMA_ATTR_NO_WARN);
if (!nr_mapped)
goto out;
+ iod->use_sgl = nvme_pci_use_sgls(dev, req);
if (iod->use_sgl)
ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
else
ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
-
- if (ret != BLK_STS_OK)
- goto out_unmap;
-
- ret = BLK_STS_IOERR;
- if (blk_integrity_rq(req)) {
- if (blk_rq_count_integrity_sg(q, req->bio) != 1)
- goto out_unmap;
-
- sg_init_table(&iod->meta_sg, 1);
- if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
- goto out_unmap;
-
- if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
- goto out_unmap;
- }
-
- if (blk_integrity_rq(req))
- cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
- return BLK_STS_OK;
-
-out_unmap:
- dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
out:
+ if (ret != BLK_STS_OK)
+ nvme_unmap_data(dev, req);
return ret;
}
-static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
+static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
+ struct nvme_command *cmnd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- enum dma_data_direction dma_dir = rq_data_dir(req) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE;
- if (iod->nents) {
- dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
- if (blk_integrity_rq(req))
- dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
- }
-
- nvme_cleanup_cmd(req);
- nvme_free_iod(dev, req);
+ iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
+ rq_dma_dir(req), 0);
+ if (dma_mapping_error(dev->dev, iod->meta_dma))
+ return BLK_STS_IOERR;
+ cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
+ return 0;
}
/*
@@ -810,35 +879,42 @@
struct nvme_queue *nvmeq = hctx->driver_data;
struct nvme_dev *dev = nvmeq->dev;
struct request *req = bd->rq;
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_command cmnd;
blk_status_t ret;
+ iod->aborted = 0;
+ iod->npages = -1;
+ iod->nents = 0;
+
/*
* We should not need to do this, but we're still using this to
* ensure we can drain requests on a dying queue.
*/
- if (unlikely(nvmeq->cq_vector < 0))
+ if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
return BLK_STS_IOERR;
ret = nvme_setup_cmd(ns, req, &cmnd);
if (ret)
return ret;
- ret = nvme_init_iod(req, dev);
- if (ret)
- goto out_free_cmd;
-
if (blk_rq_nr_phys_segments(req)) {
ret = nvme_map_data(dev, req, &cmnd);
if (ret)
- goto out_cleanup_iod;
+ goto out_free_cmd;
+ }
+
+ if (blk_integrity_rq(req)) {
+ ret = nvme_map_metadata(dev, req, &cmnd);
+ if (ret)
+ goto out_unmap_data;
}
blk_mq_start_request(req);
- nvme_submit_cmd(nvmeq, &cmnd);
+ nvme_submit_cmd(nvmeq, &cmnd, bd->last);
return BLK_STS_OK;
-out_cleanup_iod:
- nvme_free_iod(dev, req);
+out_unmap_data:
+ nvme_unmap_data(dev, req);
out_free_cmd:
nvme_cleanup_cmd(req);
return ret;
@@ -847,8 +923,14 @@
static void nvme_pci_complete_rq(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_dev *dev = iod->nvmeq->dev;
- nvme_unmap_data(iod->nvmeq->dev, req);
+ nvme_cleanup_cmd(req);
+ if (blk_integrity_rq(req))
+ dma_unmap_page(dev->dev, iod->meta_dma,
+ rq_integrity_vec(req)->bv_len, rq_data_dir(req));
+ if (blk_rq_nr_phys_segments(req))
+ nvme_unmap_data(dev, req);
nvme_complete_rq(req);
}
@@ -894,6 +976,7 @@
}
req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
+ trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
nvme_end_request(req, cqe->status, cqe->result);
}
@@ -908,21 +991,23 @@
static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
{
- if (++nvmeq->cq_head == nvmeq->q_depth) {
+ if (nvmeq->cq_head == nvmeq->q_depth - 1) {
nvmeq->cq_head = 0;
nvmeq->cq_phase = !nvmeq->cq_phase;
+ } else {
+ nvmeq->cq_head++;
}
}
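The rewritten nvme_update_cq_head() makes the wrap explicit so the phase bit flips exactly when the head returns to slot 0; a completion entry is only treated as new when its phase bit matches the expected phase. A standalone sketch of that head/phase bookkeeping (hypothetical fixed-depth ring, not the driver structures):

#include <stdbool.h>
#include <stdio.h>

#define Q_DEPTH 4u

struct cq {
	unsigned int head;
	bool phase;	/* expected phase bit of the next valid CQE */
};

/* advance the head, flipping the expected phase when wrapping to 0 */
static void update_cq_head(struct cq *cq)
{
	if (cq->head == Q_DEPTH - 1) {
		cq->head = 0;
		cq->phase = !cq->phase;
	} else {
		cq->head++;
	}
}

int main(void)
{
	struct cq cq = { .head = 0, .phase = true };
	int i;

	for (i = 0; i < 6; i++) {
		printf("head=%u expected phase=%d\n", cq.head, cq.phase);
		update_cq_head(&cq);
	}
	return 0;
}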
-static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
- u16 *end, int tag)
+static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
+ u16 *end, unsigned int tag)
{
- bool found = false;
+ int found = 0;
*start = nvmeq->cq_head;
- while (!found && nvme_cqe_pending(nvmeq)) {
- if (nvmeq->cqes[nvmeq->cq_head].command_id == tag)
- found = true;
+ while (nvme_cqe_pending(nvmeq)) {
+ if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag)
+ found++;
nvme_update_cq_head(nvmeq);
}
*end = nvmeq->cq_head;
@@ -938,12 +1023,16 @@
irqreturn_t ret = IRQ_NONE;
u16 start, end;
- spin_lock(&nvmeq->cq_lock);
+ /*
+ * The rmb/wmb pair ensures we see all updates from a previous run of
+ * the irq handler, even if that was on another CPU.
+ */
+ rmb();
if (nvmeq->cq_head != nvmeq->last_cq_head)
ret = IRQ_HANDLED;
nvme_process_cq(nvmeq, &start, &end, -1);
nvmeq->last_cq_head = nvmeq->cq_head;
- spin_unlock(&nvmeq->cq_lock);
+ wmb();
if (start != end) {
nvme_complete_cqes(nvmeq, start, end);
@@ -961,29 +1050,52 @@
return IRQ_NONE;
}
-static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
+/*
+ * Poll for completions on any queue, including those not dedicated to polling.
+ * Can be called from any context.
+ */
+static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag)
{
+ struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
+ u16 start, end;
+ int found;
+
+ /*
+ * For a poll queue we need to protect against the polling thread
+ * using the CQ lock. For normal interrupt driven threads we have
+ * to disable the interrupt to avoid racing with it.
+ */
+ if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) {
+ spin_lock(&nvmeq->cq_poll_lock);
+ found = nvme_process_cq(nvmeq, &start, &end, tag);
+ spin_unlock(&nvmeq->cq_poll_lock);
+ } else {
+ disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
+ found = nvme_process_cq(nvmeq, &start, &end, tag);
+ enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
+ }
+
+ nvme_complete_cqes(nvmeq, start, end);
+ return found;
+}
+
+static int nvme_poll(struct blk_mq_hw_ctx *hctx)
+{
+ struct nvme_queue *nvmeq = hctx->driver_data;
u16 start, end;
bool found;
if (!nvme_cqe_pending(nvmeq))
return 0;
- spin_lock_irq(&nvmeq->cq_lock);
- found = nvme_process_cq(nvmeq, &start, &end, tag);
- spin_unlock_irq(&nvmeq->cq_lock);
+ spin_lock(&nvmeq->cq_poll_lock);
+ found = nvme_process_cq(nvmeq, &start, &end, -1);
+ spin_unlock(&nvmeq->cq_poll_lock);
nvme_complete_cqes(nvmeq, start, end);
return found;
}
-static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
-{
- struct nvme_queue *nvmeq = hctx->driver_data;
-
- return __nvme_poll(nvmeq, tag);
-}
-
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
@@ -993,7 +1105,7 @@
memset(&c, 0, sizeof(c));
c.common.opcode = nvme_admin_async_event;
c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
- nvme_submit_cmd(nvmeq, &c);
+ nvme_submit_cmd(nvmeq, &c, true);
}
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
@@ -1011,7 +1123,10 @@
struct nvme_queue *nvmeq, s16 vector)
{
struct nvme_command c;
- int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
+ int flags = NVME_QUEUE_PHYS_CONTIG;
+
+ if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
+ flags |= NVME_CQ_IRQ_ENABLED;
/*
* Note: we (ab)use the fact that the prp fields survive if no data
@@ -1152,7 +1267,7 @@
/*
* Did we miss an interrupt?
*/
- if (__nvme_poll(nvmeq, req->tag)) {
+ if (nvme_poll_irqdisable(nvmeq, req->tag)) {
dev_warn(dev->ctrl.device,
"I/O %d QID %d timeout, completion polled\n",
req->tag, nvmeq->qid);
@@ -1167,13 +1282,17 @@
*/
switch (dev->ctrl.state) {
case NVME_CTRL_CONNECTING:
- case NVME_CTRL_RESETTING:
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+ /* fall through */
+ case NVME_CTRL_DELETING:
dev_warn_ratelimited(dev->ctrl.device,
"I/O %d QID %d timeout, disable controller\n",
req->tag, nvmeq->qid);
- nvme_dev_disable(dev, false);
+ nvme_dev_disable(dev, true);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
return BLK_EH_DONE;
+ case NVME_CTRL_RESETTING:
+ return BLK_EH_RESET_TIMER;
default:
break;
}
@@ -1230,11 +1349,18 @@
static void nvme_free_queue(struct nvme_queue *nvmeq)
{
- dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
+ dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
- if (nvmeq->sq_cmds)
- dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
- nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+ if (!nvmeq->sq_cmds)
+ return;
+
+ if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
+ pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
+ nvmeq->sq_cmds, SQ_SIZE(nvmeq));
+ } else {
+ dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
+ nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+ }
}
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
@@ -1249,51 +1375,42 @@
/**
* nvme_suspend_queue - put queue into suspended state
- * @nvmeq - queue to suspend
+ * @nvmeq: queue to suspend
*/
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
- int vector;
-
- spin_lock_irq(&nvmeq->cq_lock);
- if (nvmeq->cq_vector == -1) {
- spin_unlock_irq(&nvmeq->cq_lock);
+ if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
return 1;
- }
- vector = nvmeq->cq_vector;
- nvmeq->dev->online_queues--;
- nvmeq->cq_vector = -1;
- spin_unlock_irq(&nvmeq->cq_lock);
- /*
- * Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without
- * having to grab the lock.
- */
+ /* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
mb();
+ nvmeq->dev->online_queues--;
if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
-
- pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq);
-
+ if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
+ pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
return 0;
}
+static void nvme_suspend_io_queues(struct nvme_dev *dev)
+{
+ int i;
+
+ for (i = dev->ctrl.queue_count - 1; i > 0; i--)
+ nvme_suspend_queue(&dev->queues[i]);
+}
+
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
struct nvme_queue *nvmeq = &dev->queues[0];
- u16 start, end;
if (shutdown)
nvme_shutdown_ctrl(&dev->ctrl);
else
- nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
+ nvme_disable_ctrl(&dev->ctrl);
- spin_lock_irq(&nvmeq->cq_lock);
- nvme_process_cq(nvmeq, &start, &end, -1);
- spin_unlock_irq(&nvmeq->cq_lock);
-
- nvme_complete_cqes(nvmeq, start, end);
+ nvme_poll_irqdisable(nvmeq, -1);
}
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
@@ -1321,14 +1438,26 @@
}
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
- int qid, int depth)
+ int qid)
{
- /* CMB SQEs will be mapped before creation */
- if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS))
- return 0;
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
- nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
- &nvmeq->sq_dma_addr, GFP_KERNEL);
+ if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
+ nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
+ if (nvmeq->sq_cmds) {
+ nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
+ nvmeq->sq_cmds);
+ if (nvmeq->sq_dma_addr) {
+ set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
+ return 0;
+ }
+
+ pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
+ }
+ }
+
+ nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
+ &nvmeq->sq_dma_addr, GFP_KERNEL);
if (!nvmeq->sq_cmds)
return -ENOMEM;
return 0;
@@ -1341,31 +1470,30 @@
if (dev->ctrl.queue_count > qid)
return 0;
- nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
- &nvmeq->cq_dma_addr, GFP_KERNEL);
+ nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
+ nvmeq->q_depth = depth;
+ nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
+ &nvmeq->cq_dma_addr, GFP_KERNEL);
if (!nvmeq->cqes)
goto free_nvmeq;
- if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
+ if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
goto free_cqdma;
- nvmeq->q_dmadev = dev->dev;
nvmeq->dev = dev;
spin_lock_init(&nvmeq->sq_lock);
- spin_lock_init(&nvmeq->cq_lock);
+ spin_lock_init(&nvmeq->cq_poll_lock);
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
- nvmeq->q_depth = depth;
nvmeq->qid = qid;
- nvmeq->cq_vector = -1;
dev->ctrl.queue_count++;
return 0;
free_cqdma:
- dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
- nvmeq->cq_dma_addr);
+ dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
+ nvmeq->cq_dma_addr);
free_nvmeq:
return -ENOMEM;
}
@@ -1388,35 +1516,34 @@
{
struct nvme_dev *dev = nvmeq->dev;
- spin_lock_irq(&nvmeq->cq_lock);
nvmeq->sq_tail = 0;
+ nvmeq->last_sq_tail = 0;
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
- memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
+ memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
nvme_dbbuf_init(dev, nvmeq, qid);
dev->online_queues++;
- spin_unlock_irq(&nvmeq->cq_lock);
+ wmb(); /* ensure the first interrupt sees the initialization */
}
-static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
+static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
struct nvme_dev *dev = nvmeq->dev;
int result;
- s16 vector;
+ u16 vector = 0;
- if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
- unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth),
- dev->ctrl.page_size);
- nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
- nvmeq->sq_cmds_io = dev->cmb + offset;
- }
+ clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
/*
* A queue's vector matches the queue identifier unless the controller
* has only one vector available.
*/
- vector = dev->num_vecs == 1 ? 0 : qid;
+ if (!polled)
+ vector = dev->num_vecs == 1 ? 0 : qid;
+ else
+ set_bit(NVMEQ_POLLED, &nvmeq->flags);
+
result = adapter_alloc_cq(dev, qid, nvmeq, vector);
if (result)
return result;
@@ -1427,21 +1554,19 @@
else if (result)
goto release_cq;
- /*
- * Set cq_vector after alloc cq/sq, otherwise nvme_suspend_queue will
- * invoke free_irq for it and cause a 'Trying to free already-free IRQ
- * xxx' warning if the create CQ/SQ command times out.
- */
nvmeq->cq_vector = vector;
nvme_init_queue(nvmeq, qid);
- result = queue_request_irq(nvmeq);
- if (result < 0)
- goto release_sq;
+ if (!polled) {
+ result = queue_request_irq(nvmeq);
+ if (result < 0)
+ goto release_sq;
+ }
+
+ set_bit(NVMEQ_ENABLED, &nvmeq->flags);
return result;
release_sq:
- nvmeq->cq_vector = -1;
dev->online_queues--;
adapter_delete_sq(dev, qid);
release_cq:
@@ -1461,6 +1586,7 @@
static const struct blk_mq_ops nvme_mq_ops = {
.queue_rq = nvme_queue_rq,
.complete = nvme_pci_complete_rq,
+ .commit_rqs = nvme_commit_rqs,
.init_hctx = nvme_init_hctx,
.init_request = nvme_init_request,
.map_queues = nvme_pci_map_queues,
@@ -1491,7 +1617,7 @@
dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
dev->admin_tagset.timeout = ADMIN_TIMEOUT;
dev->admin_tagset.numa_node = dev_to_node(dev->dev);
- dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false);
+ dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
dev->admin_tagset.driver_data = dev;
@@ -1558,7 +1684,7 @@
(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
- result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
+ result = nvme_disable_ctrl(&dev->ctrl);
if (result < 0)
return result;
@@ -1574,7 +1700,7 @@
lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
- result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap);
+ result = nvme_enable_ctrl(&dev->ctrl);
if (result)
return result;
@@ -1582,16 +1708,17 @@
nvme_init_queue(nvmeq, 0);
result = queue_request_irq(nvmeq);
if (result) {
- nvmeq->cq_vector = -1;
+ dev->online_queues--;
return result;
}
+ set_bit(NVMEQ_ENABLED, &nvmeq->flags);
return result;
}
static int nvme_create_io_queues(struct nvme_dev *dev)
{
- unsigned i, max;
+ unsigned i, max, rw_queues;
int ret = 0;
for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
@@ -1602,8 +1729,17 @@
}
max = min(dev->max_qid, dev->ctrl.queue_count - 1);
+ if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
+ rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
+ dev->io_queues[HCTX_TYPE_READ];
+ } else {
+ rw_queues = max;
+ }
+
for (i = dev->online_queues; i <= max; i++) {
- ret = nvme_create_queue(&dev->queues[i], i);
+ bool polled = i > rw_queues;
+
+ ret = nvme_create_queue(&dev->queues[i], i, polled);
if (ret)
break;
}
@@ -1647,14 +1783,14 @@
struct pci_dev *pdev = to_pci_dev(dev->dev);
int bar;
+ if (dev->cmb_size)
+ return;
+
dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
if (!dev->cmbsz)
return;
dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
- if (!use_cmb_sqes)
- return;
-
size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
bar = NVME_CMB_BIR(dev->cmbloc);
@@ -1671,11 +1807,18 @@
if (size > bar_size - offset)
size = bar_size - offset;
- dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
- if (!dev->cmb)
+ if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
+ dev_warn(dev->ctrl.device,
+ "failed to register the CMB\n");
return;
- dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
+ }
+
dev->cmb_size = size;
+ dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);
+
+ if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
+ (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
+ pci_p2pmem_publish(pdev, true);
if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
&dev_attr_cmb.attr, NULL))
@@ -1685,12 +1828,10 @@
static inline void nvme_release_cmb(struct nvme_dev *dev)
{
- if (dev->cmb) {
- iounmap(dev->cmb);
- dev->cmb = NULL;
+ if (dev->cmb_size) {
sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
&dev_attr_cmb.attr, NULL);
- dev->cmbsz = 0;
+ dev->cmb_size = 0;
}
}
@@ -1727,8 +1868,9 @@
struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size;
- dma_free_coherent(dev->dev, size, dev->host_mem_desc_bufs[i],
- le64_to_cpu(desc->addr));
+ dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
+ le64_to_cpu(desc->addr),
+ DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
}
kfree(dev->host_mem_desc_bufs);
@@ -1757,8 +1899,8 @@
if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
max_entries = dev->ctrl.hmmaxd;
- descs = dma_zalloc_coherent(dev->dev, max_entries * sizeof(*descs),
- &descs_dma, GFP_KERNEL);
+ descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
+ &descs_dma, GFP_KERNEL);
if (!descs)
goto out;
@@ -1794,8 +1936,9 @@
while (--i >= 0) {
size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size;
- dma_free_coherent(dev->dev, size, bufs[i],
- le64_to_cpu(descs[i].addr));
+ dma_free_attrs(dev->dev, size, bufs[i],
+ le64_to_cpu(descs[i].addr),
+ DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
}
kfree(bufs);
@@ -1870,6 +2013,92 @@
return ret;
}
+/*
+ * nirqs is the number of interrupts available for write and read
+ * queues. The core already reserved an interrupt for the admin queue.
+ */
+static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
+{
+ struct nvme_dev *dev = affd->priv;
+ unsigned int nr_read_queues;
+
+ /*
+ * If there is no interrupt available for queues, ensure that
+ * the default queue is set to 1. The affinity set size is
+ * also set to one, but the irq core ignores it for this case.
+ *
+ * If only one interrupt is available or 'write_queues' == 0, combine
+ * write and read queues.
+ *
+ * If 'write_queues' > 0, ensure it leaves room for at least one read
+ * queue.
+ */
+ if (!nrirqs) {
+ nrirqs = 1;
+ nr_read_queues = 0;
+ } else if (nrirqs == 1 || !write_queues) {
+ nr_read_queues = 0;
+ } else if (write_queues >= nrirqs) {
+ nr_read_queues = 1;
+ } else {
+ nr_read_queues = nrirqs - write_queues;
+ }
+
+ dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
+ affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
+ dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
+ affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
+ affd->nr_sets = nr_read_queues ? 2 : 1;
+}
+
+static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
+{
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
+ struct irq_affinity affd = {
+ .pre_vectors = 1,
+ .calc_sets = nvme_calc_irq_sets,
+ .priv = dev,
+ };
+ unsigned int irq_queues, this_p_queues;
+ unsigned int nr_cpus = num_possible_cpus();
+
+ /*
+ * Poll queues don't need interrupts, but we need at least one IO
+ * queue left over for non-polled IO.
+ */
+ this_p_queues = poll_queues;
+ if (this_p_queues >= nr_io_queues) {
+ this_p_queues = nr_io_queues - 1;
+ irq_queues = 1;
+ } else {
+ if (nr_cpus < nr_io_queues - this_p_queues)
+ irq_queues = nr_cpus + 1;
+ else
+ irq_queues = nr_io_queues - this_p_queues + 1;
+ }
+ dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
+
+ /* Initialize for the single interrupt case */
+ dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
+ dev->io_queues[HCTX_TYPE_READ] = 0;
+
+ /*
+ * Some Apple controllers require all queues to use the
+ * first vector.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)
+ irq_queues = 1;
+
+ return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
+ PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
+}
+
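Putting the two helpers together: nvme_setup_irqs() first carves out the poll queues (which get no vector), then nvme_calc_irq_sets() splits the granted vectors between the default (write) and read sets. A standalone sketch of that arithmetic with example values (hypothetical CPU count and vector grant; it mirrors the logic above and calls no PCI API):

#include <stdio.h>

struct split {
	unsigned int poll, def, read;
};

/* mirror of the poll-queue carve-out plus the calc_sets() split */
static struct split split_queues(unsigned int nr_io_queues,
				 unsigned int write_queues,
				 unsigned int poll_queues,
				 unsigned int nr_vectors)
{
	struct split s;
	unsigned int nrirqs = nr_vectors;	/* vectors granted for IO queues (admin excluded) */
	unsigned int nr_read;

	/* keep at least one IO queue that is not polled */
	s.poll = poll_queues >= nr_io_queues ? nr_io_queues - 1 : poll_queues;

	if (!nrirqs) {
		nrirqs = 1;
		nr_read = 0;
	} else if (nrirqs == 1 || !write_queues) {
		nr_read = 0;
	} else if (write_queues >= nrirqs) {
		nr_read = 1;
	} else {
		nr_read = nrirqs - write_queues;
	}

	s.def = nrirqs - nr_read;
	s.read = nr_read;
	return s;
}

int main(void)
{
	/* e.g. 8 CPUs, write_queues=2, poll_queues=2, 6 IRQ vectors granted */
	struct split s = split_queues(8 + 2 + 2, 2, 2, 6);

	printf("%u/%u/%u default/read/poll queues\n", s.def, s.read, s.poll);
	return 0;
}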
+static void nvme_disable_io_queues(struct nvme_dev *dev)
+{
+ if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
+ __nvme_disable_io_queues(dev, nvme_admin_delete_cq);
+}
+
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
struct nvme_queue *adminq = &dev->queues[0];
@@ -1877,25 +2106,31 @@
int result, nr_io_queues;
unsigned long size;
- struct irq_affinity affd = {
- .pre_vectors = 1
- };
+ nr_io_queues = max_io_queues();
- nr_io_queues = num_possible_cpus();
+ /*
+ * If tags are shared with admin queue (Apple bug), then
+ * make sure we only use one IO queue.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+ nr_io_queues = 1;
+
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
return result;
if (nr_io_queues == 0)
return 0;
+
+ clear_bit(NVMEQ_ENABLED, &adminq->flags);
- if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) {
+ if (dev->cmb_use_sqes) {
result = nvme_cmb_qdepth(dev, nr_io_queues,
sizeof(struct nvme_command));
if (result > 0)
dev->q_depth = result;
else
- nvme_release_cmb(dev);
+ dev->cmb_use_sqes = false;
}
do {
@@ -1908,6 +2143,7 @@
} while (1);
adminq->q_db = dev->dbs;
+ retry:
/* Deregister the admin queue's interrupt */
pci_free_irq(pdev, 0, adminq);
@@ -1916,12 +2152,14 @@
* setting up the full range we need.
*/
pci_free_irq_vectors(pdev);
- result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1,
- PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
+
+ result = nvme_setup_irqs(dev, nr_io_queues);
if (result <= 0)
return -EIO;
+
dev->num_vecs = result;
- dev->max_qid = max(result - 1, 1);
+ result = max(result - 1, 1);
+ dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
/*
* Should investigate if there's a performance win from allocating
@@ -1929,13 +2167,26 @@
* path to scale better, even if the receive path is limited by the
* number of interrupts.
*/
-
result = queue_request_irq(adminq);
- if (result) {
- adminq->cq_vector = -1;
+ if (result)
return result;
+ set_bit(NVMEQ_ENABLED, &adminq->flags);
+
+ result = nvme_create_io_queues(dev);
+ if (result || dev->online_queues < 2)
+ return result;
+
+ if (dev->online_queues - 1 < dev->max_qid) {
+ nr_io_queues = dev->online_queues - 1;
+ nvme_disable_io_queues(dev);
+ nvme_suspend_io_queues(dev);
+ goto retry;
}
- return nvme_create_io_queues(dev);
+ dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
+ dev->io_queues[HCTX_TYPE_DEFAULT],
+ dev->io_queues[HCTX_TYPE_READ],
+ dev->io_queues[HCTX_TYPE_POLL]);
+ return 0;
}
static void nvme_del_queue_end(struct request *req, blk_status_t error)
@@ -1943,23 +2194,15 @@
struct nvme_queue *nvmeq = req->end_io_data;
blk_mq_free_request(req);
- complete(&nvmeq->dev->ioq_wait);
+ complete(&nvmeq->delete_done);
}
static void nvme_del_cq_end(struct request *req, blk_status_t error)
{
struct nvme_queue *nvmeq = req->end_io_data;
- u16 start, end;
- if (!error) {
- unsigned long flags;
-
- spin_lock_irqsave(&nvmeq->cq_lock, flags);
- nvme_process_cq(nvmeq, &start, &end, -1);
- spin_unlock_irqrestore(&nvmeq->cq_lock, flags);
-
- nvme_complete_cqes(nvmeq, start, end);
- }
+ if (error)
+ set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
nvme_del_queue_end(req, error);
}
@@ -1981,70 +2224,79 @@
req->timeout = ADMIN_TIMEOUT;
req->end_io_data = nvmeq;
+ init_completion(&nvmeq->delete_done);
blk_execute_rq_nowait(q, NULL, req, false,
opcode == nvme_admin_delete_cq ?
nvme_del_cq_end : nvme_del_queue_end);
return 0;
}
-static void nvme_disable_io_queues(struct nvme_dev *dev)
+static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
{
- int pass, queues = dev->online_queues - 1;
+ int nr_queues = dev->online_queues - 1, sent = 0;
unsigned long timeout;
- u8 opcode = nvme_admin_delete_sq;
- for (pass = 0; pass < 2; pass++) {
- int sent = 0, i = queues;
-
- reinit_completion(&dev->ioq_wait);
retry:
- timeout = ADMIN_TIMEOUT;
- for (; i > 0; i--, sent++)
- if (nvme_delete_queue(&dev->queues[i], opcode))
- break;
-
- while (sent--) {
- timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout);
- if (timeout == 0)
- return;
- if (i)
- goto retry;
- }
- opcode = nvme_admin_delete_cq;
+ timeout = ADMIN_TIMEOUT;
+ while (nr_queues > 0) {
+ if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
+ break;
+ nr_queues--;
+ sent++;
}
+ while (sent) {
+ struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];
+
+ timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
+ timeout);
+ if (timeout == 0)
+ return false;
+
+ /* handle any remaining CQEs */
+ if (opcode == nvme_admin_delete_cq &&
+ !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags))
+ nvme_poll_irqdisable(nvmeq, -1);
+
+ sent--;
+ if (nr_queues)
+ goto retry;
+ }
+ return true;
}
-/*
- * return error value only when tagset allocation failed
- */
-static int nvme_dev_add(struct nvme_dev *dev)
+static void nvme_dev_add(struct nvme_dev *dev)
{
int ret;
if (!dev->ctrl.tagset) {
dev->tagset.ops = &nvme_mq_ops;
dev->tagset.nr_hw_queues = dev->online_queues - 1;
+ dev->tagset.nr_maps = 2; /* default + read */
+ if (dev->io_queues[HCTX_TYPE_POLL])
+ dev->tagset.nr_maps++;
dev->tagset.timeout = NVME_IO_TIMEOUT;
dev->tagset.numa_node = dev_to_node(dev->dev);
dev->tagset.queue_depth =
min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
- dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false);
- if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) {
- dev->tagset.cmd_size = max(dev->tagset.cmd_size,
- nvme_pci_cmd_size(dev, true));
- }
+ dev->tagset.cmd_size = sizeof(struct nvme_iod);
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
+ /*
+ * Some Apple controllers require tags to be unique
+ * across the admin and IO queues, so reserve the first 32
+ * tags of the IO queue.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+ dev->tagset.reserved_tags = NVME_AQ_DEPTH;
+
ret = blk_mq_alloc_tag_set(&dev->tagset);
if (ret) {
dev_warn(dev->ctrl.device,
"IO queues tagset allocation failed %d\n", ret);
- return ret;
+ return;
}
dev->ctrl.tagset = &dev->tagset;
-
- nvme_dbbuf_set(dev);
} else {
blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
@@ -2052,7 +2304,7 @@
nvme_free_queues(dev, dev->online_queues);
}
- return 0;
+ nvme_dbbuf_set(dev);
}
static int nvme_pci_enable(struct nvme_dev *dev)
@@ -2065,8 +2317,7 @@
pci_set_master(pdev);
- if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
- dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
+ if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)))
goto disable;
if (readl(dev->bar + NVME_REG_CSTS) == -1) {
@@ -2087,10 +2338,21 @@
dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
io_queue_depth);
+ dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
dev->dbs = dev->bar + 4096;
/*
+ * Some Apple controllers require a non-standard SQE size.
+ * Interestingly they also seem to ignore the CC:IOSQES register
+ * so we don't bother updating it here.
+ */
+ if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
+ dev->io_sqes = 7;
+ else
+ dev->io_sqes = NVME_NVM_IOSQES;
+
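The IOSQES value is a log2 encoding, so the quirked value of 7 corresponds to 128-byte submission queue entries; a trivial sketch:

#include <stdio.h>

int main(void)
{
	/* SQE size in bytes is 1 << io_sqes */
	printf("default SQE:     %d bytes\n", 1 << 6);	/* NVME_NVM_IOSQES */
	printf("Apple quirk SQE: %d bytes\n", 1 << 7);	/* io_sqes == 7 */
	return 0;
}
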
+ /*
* Temporary fix for the Apple controller found in the MacBook8,1 and
* some MacBook7,1 to avoid controller resets and data loss.
*/
@@ -2107,6 +2369,18 @@
"set queue depth=%u\n", dev->q_depth);
}
+ /*
+ * Controllers with the shared tags quirk need the IO queue to be
+ * big enough so that we get 32 tags for the admin queue
+ */
+ if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
+ (dev->q_depth < (NVME_AQ_DEPTH + 2))) {
+ dev->q_depth = NVME_AQ_DEPTH + 2;
+ dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
+ dev->q_depth);
+ }
+
nvme_map_cmb(dev);
pci_enable_pcie_error_reporting(pdev);
@@ -2129,7 +2403,6 @@
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
- nvme_release_cmb(dev);
pci_free_irq_vectors(pdev);
if (pci_is_enabled(pdev)) {
@@ -2140,8 +2413,7 @@
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
- int i;
- bool dead = true;
+ bool dead = true, freeze = false;
struct pci_dev *pdev = to_pci_dev(dev->dev);
mutex_lock(&dev->shutdown_lock);
@@ -2149,8 +2421,10 @@
u32 csts = readl(dev->bar + NVME_REG_CSTS);
if (dev->ctrl.state == NVME_CTRL_LIVE ||
- dev->ctrl.state == NVME_CTRL_RESETTING)
+ dev->ctrl.state == NVME_CTRL_RESETTING) {
+ freeze = true;
nvme_start_freeze(&dev->ctrl);
+ }
dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
pdev->error_state != pci_channel_io_normal);
}
@@ -2159,10 +2433,8 @@
* Give the controller a chance to complete all entered requests if
* doing a safe shutdown.
*/
- if (!dead) {
- if (shutdown)
- nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
- }
+ if (!dead && shutdown && freeze)
+ nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
nvme_stop_queues(&dev->ctrl);
@@ -2170,24 +2442,36 @@
nvme_disable_io_queues(dev);
nvme_disable_admin_queue(dev, shutdown);
}
- for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
- nvme_suspend_queue(&dev->queues[i]);
-
+ nvme_suspend_io_queues(dev);
+ nvme_suspend_queue(&dev->queues[0]);
nvme_pci_disable(dev);
blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
+ blk_mq_tagset_wait_completed_request(&dev->tagset);
+ blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
/*
* The driver will not be starting up queues again if shutting down so
* must flush all entered requests to their failed completion to avoid
* deadlocking blk-mq hot-cpu notifier.
*/
- if (shutdown)
+ if (shutdown) {
nvme_start_queues(&dev->ctrl);
+ if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
+ blk_mq_unquiesce_queue(dev->ctrl.admin_q);
+ }
mutex_unlock(&dev->shutdown_lock);
}
+static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
+{
+ if (!nvme_wait_reset(&dev->ctrl))
+ return -EBUSY;
+ nvme_dev_disable(dev, shutdown);
+ return 0;
+}
+
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
@@ -2211,14 +2495,20 @@
dma_pool_destroy(dev->prp_small_pool);
}
+static void nvme_free_tagset(struct nvme_dev *dev)
+{
+ if (dev->tagset.tags)
+ blk_mq_free_tag_set(&dev->tagset);
+ dev->ctrl.tagset = NULL;
+}
+
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
nvme_dbbuf_dma_free(dev);
put_device(dev->dev);
- if (dev->tagset.tags)
- blk_mq_free_tag_set(&dev->tagset);
+ nvme_free_tagset(dev);
if (dev->ctrl.admin_q)
blk_put_queue(dev->ctrl.admin_q);
kfree(dev->queues);
@@ -2227,10 +2517,13 @@
kfree(dev);
}
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
{
- dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);
-
+ /*
+ * Set state to deleting now to avoid blocking nvme_wait_reset(), which
+ * may be holding this pci_dev's device lock.
+ */
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
nvme_get_ctrl(&dev->ctrl);
nvme_dev_disable(dev, false);
nvme_kill_queues(&dev->ctrl);
@@ -2243,11 +2536,12 @@
struct nvme_dev *dev =
container_of(work, struct nvme_dev, ctrl.reset_work);
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
- int result = -ENODEV;
- enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
+ int result;
- if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
+ if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) {
+ result = -ENODEV;
goto out;
+ }
/*
* If we're called to reset a live controller first shut it down before
@@ -2255,6 +2549,35 @@
*/
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
+ nvme_sync_queues(&dev->ctrl);
+
+ mutex_lock(&dev->shutdown_lock);
+ result = nvme_pci_enable(dev);
+ if (result)
+ goto out_unlock;
+
+ result = nvme_pci_configure_admin_queue(dev);
+ if (result)
+ goto out_unlock;
+
+ result = nvme_alloc_admin_tags(dev);
+ if (result)
+ goto out_unlock;
+
+ /*
+ * Limit the max command size to prevent iod->sg allocations going
+ * over a single page.
+ */
+ dev->ctrl.max_hw_sectors = min_t(u32,
+ NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
+ dev->ctrl.max_segments = NVME_MAX_SEGS;
+
+ /*
+ * Don't limit the IOMMU merged segment size.
+ */
+ dma_set_max_seg_size(dev->dev, 0xffffffff);
+
+ mutex_unlock(&dev->shutdown_lock);
/*
* Introduce CONNECTING state from nvme-fc/rdma transports to mark the
@@ -2263,28 +2586,10 @@
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
dev_warn(dev->ctrl.device,
"failed to mark controller CONNECTING\n");
+ result = -EBUSY;
goto out;
}
- result = nvme_pci_enable(dev);
- if (result)
- goto out;
-
- result = nvme_pci_configure_admin_queue(dev);
- if (result)
- goto out;
-
- result = nvme_alloc_admin_tags(dev);
- if (result)
- goto out;
-
- /*
- * Limit the max command size to prevent iod->sg allocations going
- * over a single page.
- */
- dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1;
- dev->ctrl.max_segments = NVME_MAX_SEGS;
-
result = nvme_init_identify(&dev->ctrl);
if (result)
goto out;
@@ -2325,13 +2630,11 @@
dev_warn(dev->ctrl.device, "IO queues not created\n");
nvme_kill_queues(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
- new_state = NVME_CTRL_ADMIN_ONLY;
+ nvme_free_tagset(dev);
} else {
nvme_start_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
- /* hit this only when allocate tagset fails */
- if (nvme_dev_add(dev))
- new_state = NVME_CTRL_ADMIN_ONLY;
+ nvme_dev_add(dev);
nvme_unfreeze(&dev->ctrl);
}
@@ -2339,17 +2642,23 @@
* If only admin queue live, keep it to do further investigation or
* recovery.
*/
- if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
dev_warn(dev->ctrl.device,
- "failed to mark controller state %d\n", new_state);
+ "failed to mark controller live state\n");
+ result = -ENODEV;
goto out;
}
nvme_start_ctrl(&dev->ctrl);
return;
+ out_unlock:
+ mutex_unlock(&dev->shutdown_lock);
out:
- nvme_remove_dead_ctrl(dev, result);
+ if (result)
+ dev_warn(dev->ctrl.device,
+ "Removing after probe failure status: %d\n", result);
+ nvme_remove_dead_ctrl(dev);
}
static void nvme_remove_dead_ctrl_work(struct work_struct *work)
@@ -2376,7 +2685,7 @@
static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
- *val = readq(to_nvme_dev(ctrl)->bar + off);
+ *val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
return 0;
}
@@ -2390,7 +2699,8 @@
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name = "pcie",
.module = THIS_MODULE,
- .flags = NVME_F_METADATA_SUPPORTED,
+ .flags = NVME_F_METADATA_SUPPORTED |
+ NVME_F_PCI_P2PDMA,
.reg_read32 = nvme_pci_reg_read32,
.reg_write32 = nvme_pci_reg_write32,
.reg_read64 = nvme_pci_reg_read64,
@@ -2450,7 +2760,7 @@
{
struct nvme_dev *dev = data;
- nvme_reset_ctrl_sync(&dev->ctrl);
+ flush_work(&dev->ctrl.reset_work);
flush_work(&dev->ctrl.scan_work);
nvme_put_ctrl(&dev->ctrl);
}
@@ -2470,8 +2780,8 @@
if (!dev)
return -ENOMEM;
- dev->queues = kcalloc_node(num_possible_cpus() + 1,
- sizeof(struct nvme_queue), GFP_KERNEL, node);
+ dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
+ GFP_KERNEL, node);
if (!dev->queues)
goto free;
@@ -2485,7 +2795,6 @@
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
mutex_init(&dev->shutdown_lock);
- init_completion(&dev->ioq_wait);
result = nvme_setup_prp_pools(dev);
if (result)
@@ -2517,6 +2826,7 @@
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
+ nvme_reset_ctrl(&dev->ctrl);
nvme_get_ctrl(&dev->ctrl);
async_schedule(nvme_async_probe, dev);
@@ -2539,19 +2849,28 @@
static void nvme_reset_prepare(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
- nvme_dev_disable(dev, false);
+
+ /*
+ * We don't need to check the return value from waiting for the reset
+ * state as pci_dev device lock is held, making it impossible to race
+ * with ->remove().
+ */
+ nvme_disable_prepare_reset(dev, false);
+ nvme_sync_queues(&dev->ctrl);
}
static void nvme_reset_done(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
- nvme_reset_ctrl_sync(&dev->ctrl);
+
+ if (!nvme_try_sched_reset(&dev->ctrl))
+ flush_work(&dev->ctrl.reset_work);
}
static void nvme_shutdown(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
- nvme_dev_disable(dev, true);
+ nvme_disable_prepare_reset(dev, true);
}
/*
@@ -2564,19 +2883,19 @@
struct nvme_dev *dev = pci_get_drvdata(pdev);
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
-
- cancel_work_sync(&dev->ctrl.reset_work);
pci_set_drvdata(pdev, NULL);
if (!pci_device_is_present(pdev)) {
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
nvme_dev_disable(dev, true);
+ nvme_dev_remove_admin(dev);
}
flush_work(&dev->ctrl.reset_work);
nvme_stop_ctrl(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
nvme_dev_disable(dev, true);
+ nvme_release_cmb(dev);
nvme_free_host_mem(dev);
nvme_dev_remove_admin(dev);
nvme_free_queues(dev, 0);
@@ -2587,26 +2906,115 @@
}
#ifdef CONFIG_PM_SLEEP
-static int nvme_suspend(struct device *dev)
+static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct nvme_dev *ndev = pci_get_drvdata(pdev);
+ return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps);
+}
- nvme_dev_disable(ndev, true);
- return 0;
+static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps)
+{
+ return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL);
}
static int nvme_resume(struct device *dev)
{
+ struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
+ struct nvme_ctrl *ctrl = &ndev->ctrl;
+
+ if (ndev->last_ps == U32_MAX ||
+ nvme_set_power_state(ctrl, ndev->last_ps) != 0)
+ return nvme_try_sched_reset(&ndev->ctrl);
+ return 0;
+}
+
+static int nvme_suspend(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct nvme_dev *ndev = pci_get_drvdata(pdev);
+ struct nvme_ctrl *ctrl = &ndev->ctrl;
+ int ret = -EBUSY;
+
+ ndev->last_ps = U32_MAX;
+
+ /*
+ * The platform does not remove power for a kernel managed suspend so
+ * use host managed nvme power settings for lowest idle power if
+ * possible. This should have quicker resume latency than a full device
+ * shutdown. But if the firmware is involved after the suspend or the
+ * device does not support any non-default power states, shut down the
+ * device fully.
+ *
+ * If ASPM is not enabled for the device, shut down the device and allow
+ * the PCI bus layer to put it into D3 in order to take the PCIe link
+ * down, so as to allow the platform to achieve its minimum low-power
+ * state (which may not be possible if the link is up).
+ */
+ if (pm_suspend_via_firmware() || !ctrl->npss ||
+ !pcie_aspm_enabled(pdev) ||
+ (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
+ return nvme_disable_prepare_reset(ndev, true);
+
+ nvme_start_freeze(ctrl);
+ nvme_wait_freeze(ctrl);
+ nvme_sync_queues(ctrl);
+
+ if (ctrl->state != NVME_CTRL_LIVE)
+ goto unfreeze;
+
+ ret = nvme_get_power_state(ctrl, &ndev->last_ps);
+ if (ret < 0)
+ goto unfreeze;
+
+ /*
+ * A saved state prevents pci pm from generically controlling the
+ * device's power. If we're using protocol specific settings, we don't
+ * want pci interfering.
+ */
+ pci_save_state(pdev);
+
+ ret = nvme_set_power_state(ctrl, ctrl->npss);
+ if (ret < 0)
+ goto unfreeze;
+
+ if (ret) {
+ /* discard the saved state */
+ pci_load_saved_state(pdev, NULL);
+
+ /*
+ * Clearing npss forces a controller reset on resume. The
+ * correct value will be rediscovered then.
+ */
+ ret = nvme_disable_prepare_reset(ndev, true);
+ ctrl->npss = 0;
+ }
+unfreeze:
+ nvme_unfreeze(ctrl);
+ return ret;
+}
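
Condensed, the policy above is a single predicate choosing between a full shutdown and a host-managed power-state transition; the sketch below restates it with plain booleans (inputs are hypothetical, names are illustrative):

#include <stdbool.h>
#include <stdio.h>

/* True when nvme_suspend() falls back to nvme_disable_prepare_reset(). */
static bool needs_full_shutdown(bool suspend_via_firmware, unsigned int npss,
				bool aspm_enabled, bool simple_suspend_quirk)
{
	return suspend_via_firmware || !npss || !aspm_enabled ||
	       simple_suspend_quirk;
}

int main(void)
{
	printf("%d\n", needs_full_shutdown(false, 4, true, false));	/* 0: use NPSS */
	printf("%d\n", needs_full_shutdown(false, 4, false, false));	/* 1: no ASPM  */
	printf("%d\n", needs_full_shutdown(true, 4, true, false));	/* 1: firmware */
	return 0;
}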
+
+static int nvme_simple_suspend(struct device *dev)
+{
+ struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
+ return nvme_disable_prepare_reset(ndev, true);
+}
+
+static int nvme_simple_resume(struct device *dev)
+{
struct pci_dev *pdev = to_pci_dev(dev);
struct nvme_dev *ndev = pci_get_drvdata(pdev);
- nvme_reset_ctrl(&ndev->ctrl);
- return 0;
+ return nvme_try_sched_reset(&ndev->ctrl);
}
-#endif
-static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);
+static const struct dev_pm_ops nvme_dev_pm_ops = {
+ .suspend = nvme_suspend,
+ .resume = nvme_resume,
+ .freeze = nvme_simple_suspend,
+ .thaw = nvme_simple_resume,
+ .poweroff = nvme_simple_suspend,
+ .restore = nvme_simple_resume,
+};
+#endif /* CONFIG_PM_SLEEP */
static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
pci_channel_state_t state)
@@ -2649,7 +3057,6 @@
struct nvme_dev *dev = pci_get_drvdata(pdev);
flush_work(&dev->ctrl.reset_work);
- pci_cleanup_aer_uncorrect_error_status(pdev);
}
static const struct pci_error_handlers nvme_err_handler = {
@@ -2676,8 +3083,11 @@
{ PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_MEDIUM_PRIO_SQ },
+ { PCI_VDEVICE(INTEL, 0xf1a6), /* Intel 760p/Pro 7600p */
+ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
- .driver_data = NVME_QUIRK_IDENTIFY_CNS, },
+ .driver_data = NVME_QUIRK_IDENTIFY_CNS |
+ NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */
.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
{ PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */
@@ -2696,9 +3106,18 @@
.driver_data = NVME_QUIRK_LIGHTNVM, },
{ PCI_DEVICE(0x1d1d, 0x2601), /* CNEX Granby */
.driver_data = NVME_QUIRK_LIGHTNVM, },
+ { PCI_DEVICE(0x10ec, 0x5762), /* ADATA SX6000LNP */
+ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */
+ .driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+ NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
+ { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
+ .driver_data = NVME_QUIRK_SINGLE_VECTOR |
+ NVME_QUIRK_128_BYTES_SQES |
+ NVME_QUIRK_SHARED_TAGS },
{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);
@@ -2709,15 +3128,21 @@
.probe = nvme_probe,
.remove = nvme_remove,
.shutdown = nvme_shutdown,
+#ifdef CONFIG_PM_SLEEP
.driver = {
.pm = &nvme_dev_pm_ops,
},
+#endif
.sriov_configure = pci_sriov_configure_simple,
.err_handler = &nvme_err_handler,
};
static int __init nvme_init(void)
{
+ BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
+ BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
return pci_register_driver(&nvme_driver);
}
@@ -2725,7 +3150,6 @@
{
pci_unregister_driver(&nvme_driver);
flush_workqueue(nvme_wq);
- _nvme_check_size();
}
MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index b6a28de..cb4c300 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* NVMe over Fabrics RDMA host code.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -119,6 +111,7 @@
struct nvme_ctrl ctrl;
bool use_inline_data;
+ u32 io_queues[HCTX_MAX_TYPES];
};
static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
@@ -162,6 +155,13 @@
return queue - queue->ctrl->queues;
}
+static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue)
+{
+ return nvme_rdma_queue_idx(queue) >
+ queue->ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+ queue->ctrl->io_queues[HCTX_TYPE_READ];
+}
+
static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
{
return queue->cmnd_capsule_len - sizeof(struct nvme_command);
@@ -213,6 +213,11 @@
if (!ring)
return NULL;
+ /*
+ * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
+ * lifetime. It's safe, since any change in the underlying RDMA device
+ * will issue error recovery and queue re-creation.
+ */
for (i = 0; i < ib_queue_size; i++) {
if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
goto out_free_ring;
@@ -234,8 +239,15 @@
static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
{
- wait_for_completion_interruptible_timeout(&queue->cm_done,
+ int ret;
+
+ ret = wait_for_completion_interruptible_timeout(&queue->cm_done,
msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return -ETIMEDOUT;
+ WARN_ON_ONCE(queue->cm_error > 0);
return queue->cm_error;
}
@@ -267,14 +279,9 @@
static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
struct request *rq, unsigned int hctx_idx)
{
- struct nvme_rdma_ctrl *ctrl = set->driver_data;
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
- int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
- struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
- struct nvme_rdma_device *dev = queue->device;
- nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
- DMA_TO_DEVICE);
+ kfree(req->sqe.data);
}
static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
@@ -285,15 +292,11 @@
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
- struct nvme_rdma_device *dev = queue->device;
- struct ib_device *ibdev = dev->dev;
- int ret;
nvme_req(rq)->ctrl = &ctrl->ctrl;
- ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
- DMA_TO_DEVICE);
- if (ret)
- return ret;
+ req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL);
+ if (!req->sqe.data)
+ return -ENOMEM;
req->queue = queue;
@@ -424,7 +427,7 @@
static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
{
return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
- ibdev->attrs.max_fast_reg_page_list_len);
+ ibdev->attrs.max_fast_reg_page_list_len - 1);
}
static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
@@ -433,7 +436,8 @@
const int send_wr_factor = 3; /* MR, SEND, INV */
const int cq_factor = send_wr_factor + 1; /* + RECV */
int comp_vector, idx = nvme_rdma_queue_idx(queue);
- int ret;
+ enum ib_poll_context poll_ctx;
+ int ret, pages_per_mr;
queue->device = nvme_rdma_find_get_device(queue->cm_id);
if (!queue->device) {
@@ -449,10 +453,16 @@
*/
comp_vector = idx == 0 ? idx : idx - 1;
+ /* Polling queues need direct cq polling context */
+ if (nvme_rdma_poll_queue(queue))
+ poll_ctx = IB_POLL_DIRECT;
+ else
+ poll_ctx = IB_POLL_SOFTIRQ;
+
/* +1 for ib_stop_cq */
queue->ib_cq = ib_alloc_cq(ibdev, queue,
cq_factor * queue->queue_size + 1,
- comp_vector, IB_POLL_SOFTIRQ);
+ comp_vector, poll_ctx);
if (IS_ERR(queue->ib_cq)) {
ret = PTR_ERR(queue->ib_cq);
goto out_put_dev;
@@ -469,10 +479,16 @@
goto out_destroy_qp;
}
+ /*
+ * Currently we don't use SG_GAPS MRs, so if the first entry is
+ * misaligned we'll end up using two entries for a single data page;
+ * one additional entry is therefore required.
+ */
+ pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev) + 1;
ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
queue->queue_size,
IB_MR_TYPE_MEM_REG,
- nvme_rdma_get_max_fr_pages(ibdev));
+ pages_per_mr, 0);
if (ret) {
dev_err(queue->ctrl->ctrl.device,
"failed to initialize MR pool sized %d for QID %d\n",
@@ -552,13 +568,17 @@
return ret;
}
+static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
+{
+ rdma_disconnect(queue->cm_id);
+ ib_drain_qp(queue->qp);
+}
+
static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
{
if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
return;
-
- rdma_disconnect(queue->cm_id);
- ib_drain_qp(queue->qp);
+ __nvme_rdma_stop_queue(queue);
}
static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
@@ -588,18 +608,23 @@
static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
{
+ struct nvme_rdma_queue *queue = &ctrl->queues[idx];
+ bool poll = nvme_rdma_poll_queue(queue);
int ret;
if (idx)
- ret = nvmf_connect_io_queue(&ctrl->ctrl, idx);
+ ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, poll);
else
ret = nvmf_connect_admin_queue(&ctrl->ctrl);
- if (!ret)
- set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags);
- else
+ if (!ret) {
+ set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
+ } else {
+ if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
+ __nvme_rdma_stop_queue(queue);
dev_info(ctrl->ctrl.device,
"failed to connect queue: %d ret=%d\n", idx, ret);
+ }
return ret;
}
@@ -625,18 +650,16 @@
{
struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
struct ib_device *ibdev = ctrl->device->dev;
- unsigned int nr_io_queues;
+ unsigned int nr_io_queues, nr_default_queues;
+ unsigned int nr_read_queues, nr_poll_queues;
int i, ret;
- nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
-
- /*
- * we map queues according to the device irq vectors for
- * optimal locality so we don't need more queues than
- * completion vectors.
- */
- nr_io_queues = min_t(unsigned int, nr_io_queues,
- ibdev->num_comp_vectors);
+ nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
+ min(opts->nr_io_queues, num_online_cpus()));
+ nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors,
+ min(opts->nr_write_queues, num_online_cpus()));
+ nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
+ nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;
ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
if (ret)
@@ -649,6 +672,34 @@
dev_info(ctrl->ctrl.device,
"creating %d I/O queues.\n", nr_io_queues);
+ if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
+ /*
+ * separate read/write queues
+ * hand out dedicated default queues only after we have
+ * sufficient read queues.
+ */
+ ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues;
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+ min(nr_default_queues, nr_io_queues);
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ } else {
+ /*
+ * shared read/write queues
+ * either no write queues were requested, or we don't have
+ * sufficient queue count to have dedicated default queues.
+ */
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+ min(nr_read_queues, nr_io_queues);
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ }
+
+ if (opts->nr_poll_queues && nr_io_queues) {
+ /* map dedicated poll queues only if we have queues left */
+ ctrl->io_queues[HCTX_TYPE_POLL] =
+ min(nr_poll_queues, nr_io_queues);
+ }
+
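A worked example of that distribution, using made-up option values and a target that grants fewer queues than requested:

#include <stdio.h>

#define MIN(a, b)	((a) < (b) ? (a) : (b))

int main(void)
{
	/* hypothetical: 4 read, 4 write (default), 2 poll queues requested */
	unsigned int nr_read_queues = 4, nr_default_queues = 4, nr_poll_queues = 2;
	unsigned int nr_write_queues_opt = 4;	/* stands in for opts->nr_write_queues */
	unsigned int nr_io_queues = 7;		/* target granted only 7 queues */
	unsigned int q_default = 0, q_read = 0, q_poll = 0;

	if (nr_write_queues_opt && nr_read_queues < nr_io_queues) {
		/* separate read/write queues: read queues are handed out first */
		q_read = nr_read_queues;
		nr_io_queues -= q_read;
		q_default = MIN(nr_default_queues, nr_io_queues);
		nr_io_queues -= q_default;
	} else {
		/* shared read/write queues */
		q_default = MIN(nr_read_queues, nr_io_queues);
		nr_io_queues -= q_default;
	}
	if (nr_poll_queues && nr_io_queues)
		q_poll = MIN(nr_poll_queues, nr_io_queues);

	/* prints: default=3 read=4 poll=0 */
	printf("default=%u read=%u poll=%u\n", q_default, q_read, q_poll);
	return 0;
}
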
for (i = 1; i < ctrl->ctrl.queue_count; i++) {
ret = nvme_rdma_alloc_queue(ctrl, i,
ctrl->ctrl.sqsize + 1);
@@ -665,15 +716,6 @@
return ret;
}
-static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl,
- struct blk_mq_tag_set *set)
-{
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-
- blk_mq_free_tag_set(set);
- nvme_rdma_dev_put(ctrl->device);
-}
-
static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
bool admin)
{
@@ -687,7 +729,7 @@
set->ops = &nvme_rdma_admin_mq_ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = 2; /* connect + keep-alive */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->numa_node;
set->cmd_size = sizeof(struct nvme_rdma_request) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
set->driver_data = ctrl;
@@ -700,35 +742,21 @@
set->ops = &nvme_rdma_mq_ops;
set->queue_depth = nctrl->sqsize + 1;
set->reserved_tags = 1; /* fabric connect */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_SHOULD_MERGE;
set->cmd_size = sizeof(struct nvme_rdma_request) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
set->driver_data = ctrl;
set->nr_hw_queues = nctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
+ set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
}
ret = blk_mq_alloc_tag_set(set);
if (ret)
- goto out;
-
- /*
- * We need a reference on the device as long as the tag_set is alive,
- * as the MRs in the request structures need a valid ib_device.
- */
- ret = nvme_rdma_dev_get(ctrl->device);
- if (!ret) {
- ret = -EINVAL;
- goto out_free_tagset;
- }
+ return ERR_PTR(ret);
return set;
-
-out_free_tagset:
- blk_mq_free_tag_set(set);
-out:
- return ERR_PTR(ret);
}
static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
@@ -736,7 +764,8 @@
{
if (remove) {
blk_cleanup_queue(ctrl->ctrl.admin_q);
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
+ blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
}
if (ctrl->async_event_sqe.data) {
nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
@@ -756,9 +785,15 @@
return error;
ctrl->device = ctrl->queues[0].device;
+ ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device);
ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
+ /*
+ * Bind the async event SQE DMA mapping to the admin queue lifetime.
+ * It's safe, since any change in the underlying RDMA device will issue
+ * error recovery and queue re-creation.
+ */
error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
sizeof(struct nvme_command), DMA_TO_DEVICE);
if (error)
@@ -771,10 +806,16 @@
goto out_free_async_qe;
}
+ ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+ if (IS_ERR(ctrl->ctrl.fabrics_q)) {
+ error = PTR_ERR(ctrl->ctrl.fabrics_q);
+ goto out_free_tagset;
+ }
+
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.admin_q)) {
error = PTR_ERR(ctrl->ctrl.admin_q);
- goto out_free_tagset;
+ goto out_cleanup_fabrics_q;
}
}
@@ -782,23 +823,14 @@
if (error)
goto out_cleanup_queue;
- error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP,
- &ctrl->ctrl.cap);
- if (error) {
- dev_err(ctrl->ctrl.device,
- "prop_get NVME_REG_CAP failed\n");
- goto out_stop_queue;
- }
-
- ctrl->ctrl.sqsize =
- min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
-
- error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+ error = nvme_enable_ctrl(&ctrl->ctrl);
if (error)
goto out_stop_queue;
- ctrl->ctrl.max_hw_sectors =
- (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
+ ctrl->ctrl.max_segments = ctrl->max_fr_pages;
+ ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);
+
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
error = nvme_init_identify(&ctrl->ctrl);
if (error)
@@ -811,9 +843,12 @@
out_cleanup_queue:
if (new)
blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_cleanup_fabrics_q:
+ if (new)
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
out_free_tagset:
if (new)
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
out_free_async_qe:
nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
sizeof(struct nvme_command), DMA_TO_DEVICE);
@@ -828,7 +863,7 @@
{
if (remove) {
blk_cleanup_queue(ctrl->ctrl.connect_q);
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.tagset);
}
nvme_rdma_free_io_queues(ctrl);
}
@@ -869,7 +904,7 @@
blk_cleanup_queue(ctrl->ctrl.connect_q);
out_free_tag_set:
if (new)
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.tagset);
out_free_io_queues:
nvme_rdma_free_io_queues(ctrl);
return ret;
@@ -880,9 +915,13 @@
{
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
- blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request,
- &ctrl->ctrl);
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ if (ctrl->ctrl.admin_tagset) {
+ blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
+ nvme_cancel_request, &ctrl->ctrl);
+ blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset);
+ }
+ if (remove)
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_destroy_admin_queue(ctrl, remove);
}
@@ -892,22 +931,17 @@
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
- blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request,
- &ctrl->ctrl);
+ if (ctrl->ctrl.tagset) {
+ blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
+ nvme_cancel_request, &ctrl->ctrl);
+ blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset);
+ }
if (remove)
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_destroy_io_queues(ctrl, remove);
}
}
-static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
-{
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-
- cancel_work_sync(&ctrl->err_work);
- cancel_delayed_work_sync(&ctrl->reconnect_work);
-}
-
static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -1038,6 +1072,7 @@
nvme_rdma_teardown_io_queues(ctrl, false);
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, false);
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
@@ -1116,7 +1151,7 @@
struct nvme_rdma_device *dev = queue->device;
struct ib_device *ibdev = dev->dev;
- if (!blk_rq_payload_bytes(rq))
+ if (!blk_rq_nr_phys_segments(rq))
return;
if (req->mr) {
@@ -1124,12 +1159,10 @@
req->mr = NULL;
}
- ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
- req->nents, rq_data_dir(rq) ==
- WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
nvme_cleanup_cmd(rq);
- sg_free_table_chained(&req->sg_table, true);
+ sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
}
static int nvme_rdma_set_sg_null(struct nvme_command *c)
@@ -1239,19 +1272,20 @@
c->common.flags |= NVME_CMD_SGL_METABUF;
- if (!blk_rq_payload_bytes(rq))
+ if (!blk_rq_nr_phys_segments(rq))
return nvme_rdma_set_sg_null(c);
req->sg_table.sgl = req->first_sgl;
ret = sg_alloc_table_chained(&req->sg_table,
- blk_rq_nr_phys_segments(rq), req->sg_table.sgl);
+ blk_rq_nr_phys_segments(rq), req->sg_table.sgl,
+ SG_CHUNK_SIZE);
if (ret)
return -ENOMEM;
req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
- rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rq_dma_dir(rq));
if (unlikely(count <= 0)) {
ret = -EIO;
goto out_free_table;
@@ -1280,11 +1314,9 @@
return 0;
out_unmap_sg:
- ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
- req->nents, rq_data_dir(rq) ==
- WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
out_free_table:
- sg_free_table_chained(&req->sg_table, true);
+ sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
return ret;
}
@@ -1404,12 +1436,11 @@
WARN_ON_ONCE(ret);
}
-static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
- struct nvme_completion *cqe, struct ib_wc *wc, int tag)
+static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
+ struct nvme_completion *cqe, struct ib_wc *wc)
{
struct request *rq;
struct nvme_rdma_request *req;
- int ret = 0;
rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
if (!rq) {
@@ -1417,7 +1448,7 @@
"tag 0x%x on QP %#x not found\n",
cqe->command_id, queue->qp->qp_num);
nvme_rdma_error_recovery(queue->ctrl);
- return ret;
+ return;
}
req = blk_mq_rq_to_pdu(rq);
@@ -1432,6 +1463,8 @@
nvme_rdma_error_recovery(queue->ctrl);
}
} else if (req->mr) {
+ int ret;
+
ret = nvme_rdma_inv_rkey(queue, req);
if (unlikely(ret < 0)) {
dev_err(queue->ctrl->ctrl.device,
@@ -1440,19 +1473,14 @@
nvme_rdma_error_recovery(queue->ctrl);
}
/* the local invalidation completion will end the request */
- return 0;
+ return;
}
- if (refcount_dec_and_test(&req->ref)) {
- if (rq->tag == tag)
- ret = 1;
+ if (refcount_dec_and_test(&req->ref))
nvme_end_request(rq, req->status, req->result);
- }
-
- return ret;
}
-static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
+static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvme_rdma_qe *qe =
container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
@@ -1460,11 +1488,10 @@
struct ib_device *ibdev = queue->device->dev;
struct nvme_completion *cqe = qe->data;
const size_t len = sizeof(struct nvme_completion);
- int ret = 0;
if (unlikely(wc->status != IB_WC_SUCCESS)) {
nvme_rdma_wr_error(cq, wc, "RECV");
- return 0;
+ return;
}
ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
@@ -1479,16 +1506,10 @@
nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
&cqe->result);
else
- ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag);
+ nvme_rdma_process_nvme_rsp(queue, cqe, wc);
ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
nvme_rdma_post_recv(queue, qe);
- return ret;
-}
-
-static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
-{
- __nvme_rdma_recv_done(cq, wc, -1);
}
static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
@@ -1536,16 +1557,18 @@
static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
{
+ struct nvme_ctrl *ctrl = &queue->ctrl->ctrl;
int ret;
ret = nvme_rdma_create_queue_ib(queue);
if (ret)
return ret;
+ if (ctrl->opts->tos >= 0)
+ rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
if (ret) {
- dev_err(queue->ctrl->ctrl.device,
- "rdma_resolve_route failed (%d).\n",
+ dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
queue->cm_error);
goto out_destroy_queue;
}
@@ -1672,18 +1695,36 @@
nvme_rdma_timeout(struct request *rq, bool reserved)
{
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_rdma_queue *queue = req->queue;
+ struct nvme_rdma_ctrl *ctrl = queue->ctrl;
- dev_warn(req->queue->ctrl->ctrl.device,
- "I/O %d QID %d timeout, reset controller\n",
- rq->tag, nvme_rdma_queue_idx(req->queue));
+ dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
+ rq->tag, nvme_rdma_queue_idx(queue));
- /* queue error recovery */
- nvme_rdma_error_recovery(req->queue->ctrl);
+ /*
+ * Restart the timer if a controller reset is already scheduled. Any
+ * timed out commands would be handled before entering the connecting
+ * state.
+ */
+ if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
+ return BLK_EH_RESET_TIMER;
- /* fail with DNR on cmd timeout */
- nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+ if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
+ /*
+ * Teardown immediately if the controller times out while starting
+ * or if we have already started error recovery. All outstanding
+ * requests are completed on shutdown, so we return BLK_EH_DONE.
+ */
+ flush_work(&ctrl->err_work);
+ nvme_rdma_teardown_io_queues(ctrl, false);
+ nvme_rdma_teardown_admin_queue(ctrl, false);
+ return BLK_EH_DONE;
+ }
- return BLK_EH_DONE;
+ dev_warn(ctrl->ctrl.device, "starting error recovery\n");
+ nvme_rdma_error_recovery(ctrl);
+
+ return BLK_EH_RESET_TIMER;
}
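
The resulting decision table, restated as a tiny sketch (the state strings and return names are just labels here):

#include <stdio.h>
#include <string.h>

static const char *timeout_action(const char *state)
{
	if (!strcmp(state, "RESETTING"))
		return "BLK_EH_RESET_TIMER";	/* a reset is already scheduled */
	if (strcmp(state, "LIVE"))
		return "BLK_EH_DONE";		/* teardown completes the request */
	return "BLK_EH_RESET_TIMER";		/* LIVE: kick error recovery */
}

int main(void)
{
	printf("RESETTING  -> %s\n", timeout_action("RESETTING"));
	printf("CONNECTING -> %s\n", timeout_action("CONNECTING"));
	printf("LIVE       -> %s\n", timeout_action("LIVE"));
	return 0;
}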
static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -1706,12 +1747,20 @@
return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
dev = queue->device->dev;
+
+ req->sqe.dma = ib_dma_map_single(dev, req->sqe.data,
+ sizeof(struct nvme_command),
+ DMA_TO_DEVICE);
+ err = ib_dma_mapping_error(dev, req->sqe.dma);
+ if (unlikely(err))
+ return BLK_STS_RESOURCE;
+
ib_dma_sync_single_for_cpu(dev, sqe->dma,
sizeof(struct nvme_command), DMA_TO_DEVICE);
ret = nvme_setup_cmd(ns, rq, c);
if (ret)
- return ret;
+ goto unmap_qe;
blk_mq_start_request(rq);
@@ -1736,46 +1785,82 @@
}
return BLK_STS_OK;
+
err:
if (err == -ENOMEM || err == -EAGAIN)
- return BLK_STS_RESOURCE;
- return BLK_STS_IOERR;
+ ret = BLK_STS_RESOURCE;
+ else
+ ret = BLK_STS_IOERR;
+unmap_qe:
+ ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command),
+ DMA_TO_DEVICE);
+ return ret;
}
-static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
{
struct nvme_rdma_queue *queue = hctx->driver_data;
- struct ib_cq *cq = queue->ib_cq;
- struct ib_wc wc;
- int found = 0;
- while (ib_poll_cq(cq, 1, &wc) > 0) {
- struct ib_cqe *cqe = wc.wr_cqe;
-
- if (cqe) {
- if (cqe->done == nvme_rdma_recv_done)
- found |= __nvme_rdma_recv_done(cq, &wc, tag);
- else
- cqe->done(cq, &wc);
- }
- }
-
- return found;
+ return ib_process_cq_direct(queue->ib_cq, -1);
}
static void nvme_rdma_complete_rq(struct request *rq)
{
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_rdma_queue *queue = req->queue;
+ struct ib_device *ibdev = queue->device->dev;
- nvme_rdma_unmap_data(req->queue, rq);
+ nvme_rdma_unmap_data(queue, rq);
+ ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command),
+ DMA_TO_DEVICE);
nvme_complete_rq(rq);
}
static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
{
struct nvme_rdma_ctrl *ctrl = set->driver_data;
+ struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
- return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0);
+ if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
+ /* separate read/write queues */
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+ set->map[HCTX_TYPE_READ].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_READ];
+ set->map[HCTX_TYPE_READ].queue_offset =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ } else {
+ /* shared read/write queues */
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+ set->map[HCTX_TYPE_READ].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_READ].queue_offset = 0;
+ }
+ blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
+ ctrl->device->dev, 0);
+ blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
+ ctrl->device->dev, 0);
+
+ if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
+ /* map dedicated poll queues only if we have queues left */
+ set->map[HCTX_TYPE_POLL].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_POLL];
+ set->map[HCTX_TYPE_POLL].queue_offset =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+ ctrl->io_queues[HCTX_TYPE_READ];
+ blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
+ }
+
+ dev_info(ctrl->ctrl.device,
+ "mapped %d/%d/%d default/read/poll queues.\n",
+ ctrl->io_queues[HCTX_TYPE_DEFAULT],
+ ctrl->io_queues[HCTX_TYPE_READ],
+ ctrl->io_queues[HCTX_TYPE_POLL]);
+
+ return 0;
}
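
In the separate read/write case, with the hypothetical split from before (default=3, read=4, poll=2), the hctx maps and their offsets into the controller's queue array come out as:

#include <stdio.h>

int main(void)
{
	unsigned int def = 3, read = 4, poll = 2;	/* hypothetical split */

	printf("DEFAULT: nr_queues=%u queue_offset=%u\n", def, 0u);
	printf("READ:    nr_queues=%u queue_offset=%u\n", read, def);
	printf("POLL:    nr_queues=%u queue_offset=%u\n", poll, def + read);
	return 0;
}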
static const struct blk_mq_ops nvme_rdma_mq_ops = {
@@ -1784,9 +1869,9 @@
.init_request = nvme_rdma_init_request,
.exit_request = nvme_rdma_exit_request,
.init_hctx = nvme_rdma_init_hctx,
- .poll = nvme_rdma_poll,
.timeout = nvme_rdma_timeout,
.map_queues = nvme_rdma_map_queues,
+ .poll = nvme_rdma_poll,
};
static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
@@ -1800,11 +1885,15 @@
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{
+ cancel_work_sync(&ctrl->err_work);
+ cancel_delayed_work_sync(&ctrl->reconnect_work);
+
nvme_rdma_teardown_io_queues(ctrl, shutdown);
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
if (shutdown)
nvme_shutdown_ctrl(&ctrl->ctrl);
else
- nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+ nvme_disable_ctrl(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, shutdown);
}
@@ -1848,57 +1937,8 @@
.submit_async_event = nvme_rdma_submit_async_event,
.delete_ctrl = nvme_rdma_delete_ctrl,
.get_address = nvmf_get_address,
- .stop_ctrl = nvme_rdma_stop_ctrl,
};
-static inline bool
-__nvme_rdma_options_match(struct nvme_rdma_ctrl *ctrl,
- struct nvmf_ctrl_options *opts)
-{
- char *stdport = __stringify(NVME_RDMA_IP_PORT);
-
-
- if (!nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts) ||
- strcmp(opts->traddr, ctrl->ctrl.opts->traddr))
- return false;
-
- if (opts->mask & NVMF_OPT_TRSVCID &&
- ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) {
- if (strcmp(opts->trsvcid, ctrl->ctrl.opts->trsvcid))
- return false;
- } else if (opts->mask & NVMF_OPT_TRSVCID) {
- if (strcmp(opts->trsvcid, stdport))
- return false;
- } else if (ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) {
- if (strcmp(stdport, ctrl->ctrl.opts->trsvcid))
- return false;
- }
- /* else, it's a match as both have stdport. Fall to next checks */
-
- /*
- * checking the local address is rough. In most cases, one
- * is not specified and the host port is selected by the stack.
- *
- * Assume no match if:
- * local address is specified and address is not the same
- * local address is not specified but remote is, or vice versa
- * (admin using specific host_traddr when it matters).
- */
- if (opts->mask & NVMF_OPT_HOST_TRADDR &&
- ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
- if (strcmp(opts->host_traddr, ctrl->ctrl.opts->host_traddr))
- return false;
- } else if (opts->mask & NVMF_OPT_HOST_TRADDR ||
- ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
- return false;
- /*
- * if neither controller had an host port specified, assume it's
- * a match as everything else matched.
- */
-
- return true;
-}
-
/*
* Fails a connection request if it matches an existing controller
* (association) with the same tuple:
@@ -1919,7 +1959,7 @@
mutex_lock(&nvme_rdma_ctrl_mutex);
list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
- found = __nvme_rdma_options_match(ctrl, opts);
+ found = nvmf_ip_options_match(&ctrl->ctrl, opts);
if (found)
break;
}
@@ -1934,7 +1974,6 @@
struct nvme_rdma_ctrl *ctrl;
int ret;
bool changed;
- char *port;
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl)
@@ -1942,15 +1981,21 @@
ctrl->ctrl.opts = opts;
INIT_LIST_HEAD(&ctrl->list);
- if (opts->mask & NVMF_OPT_TRSVCID)
- port = opts->trsvcid;
- else
- port = __stringify(NVME_RDMA_IP_PORT);
+ if (!(opts->mask & NVMF_OPT_TRSVCID)) {
+ opts->trsvcid =
+ kstrdup(__stringify(NVME_RDMA_IP_PORT), GFP_KERNEL);
+ if (!opts->trsvcid) {
+ ret = -ENOMEM;
+ goto out_free_ctrl;
+ }
+ opts->mask |= NVMF_OPT_TRSVCID;
+ }
ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
- opts->traddr, port, &ctrl->addr);
+ opts->traddr, opts->trsvcid, &ctrl->addr);
if (ret) {
- pr_err("malformed address passed: %s:%s\n", opts->traddr, port);
+ pr_err("malformed address passed: %s:%s\n",
+ opts->traddr, opts->trsvcid);
goto out_free_ctrl;
}
@@ -1974,7 +2019,8 @@
INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
- ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
+ ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
+ opts->nr_poll_queues + 1;
ctrl->ctrl.sqsize = opts->queue_size - 1;
ctrl->ctrl.kato = opts->kato;
@@ -2025,7 +2071,9 @@
.module = THIS_MODULE,
.required_opts = NVMF_OPT_TRADDR,
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
- NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
+ NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
+ NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
+ NVMF_OPT_TOS,
.create_ctrl = nvme_rdma_create_ctrl,
};
@@ -2085,8 +2133,16 @@
static void __exit nvme_rdma_cleanup_module(void)
{
+ struct nvme_rdma_ctrl *ctrl;
+
nvmf_unregister_transport(&nvme_rdma_transport);
ib_unregister_client(&nvme_rdma_ib_client);
+
+ mutex_lock(&nvme_rdma_ctrl_mutex);
+ list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list)
+ nvme_delete_ctrl(&ctrl->ctrl);
+ mutex_unlock(&nvme_rdma_ctrl_mutex);
+ flush_workqueue(nvme_delete_wq);
}
module_init(nvme_rdma_init_module);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
new file mode 100644
index 0000000..7544be8
--- /dev/null
+++ b/drivers/nvme/host/tcp.c
@@ -0,0 +1,2419 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics TCP host.
+ * Copyright (c) 2018 Lightbits Labs. All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/nvme-tcp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/blk-mq.h>
+#include <crypto/hash.h>
+#include <net/busy_poll.h>
+
+#include "nvme.h"
+#include "fabrics.h"
+
+struct nvme_tcp_queue;
+
+enum nvme_tcp_send_state {
+ NVME_TCP_SEND_CMD_PDU = 0,
+ NVME_TCP_SEND_H2C_PDU,
+ NVME_TCP_SEND_DATA,
+ NVME_TCP_SEND_DDGST,
+};
+
+struct nvme_tcp_request {
+ struct nvme_request req;
+ void *pdu;
+ struct nvme_tcp_queue *queue;
+ u32 data_len;
+ u32 pdu_len;
+ u32 pdu_sent;
+ u16 ttag;
+ struct list_head entry;
+ __le32 ddgst;
+
+ struct bio *curr_bio;
+ struct iov_iter iter;
+
+ /* send state */
+ size_t offset;
+ size_t data_sent;
+ enum nvme_tcp_send_state state;
+};
+
+enum nvme_tcp_queue_flags {
+ NVME_TCP_Q_ALLOCATED = 0,
+ NVME_TCP_Q_LIVE = 1,
+};
+
+enum nvme_tcp_recv_state {
+ NVME_TCP_RECV_PDU = 0,
+ NVME_TCP_RECV_DATA,
+ NVME_TCP_RECV_DDGST,
+};
+
+struct nvme_tcp_ctrl;
+struct nvme_tcp_queue {
+ struct socket *sock;
+ struct work_struct io_work;
+ int io_cpu;
+
+ spinlock_t lock;
+ struct list_head send_list;
+
+ /* recv state */
+ void *pdu;
+ int pdu_remaining;
+ int pdu_offset;
+ size_t data_remaining;
+ size_t ddgst_remaining;
+ unsigned int nr_cqe;
+
+ /* send state */
+ struct nvme_tcp_request *request;
+
+ int queue_size;
+ size_t cmnd_capsule_len;
+ struct nvme_tcp_ctrl *ctrl;
+ unsigned long flags;
+ bool rd_enabled;
+
+ bool hdr_digest;
+ bool data_digest;
+ struct ahash_request *rcv_hash;
+ struct ahash_request *snd_hash;
+ __le32 exp_ddgst;
+ __le32 recv_ddgst;
+
+ struct page_frag_cache pf_cache;
+
+ void (*state_change)(struct sock *);
+ void (*data_ready)(struct sock *);
+ void (*write_space)(struct sock *);
+};
+
+struct nvme_tcp_ctrl {
+ /* read only in the hot path */
+ struct nvme_tcp_queue *queues;
+ struct blk_mq_tag_set tag_set;
+
+ /* other member variables */
+ struct list_head list;
+ struct blk_mq_tag_set admin_tag_set;
+ struct sockaddr_storage addr;
+ struct sockaddr_storage src_addr;
+ struct nvme_ctrl ctrl;
+
+ struct work_struct err_work;
+ struct delayed_work connect_work;
+ struct nvme_tcp_request async_req;
+ u32 io_queues[HCTX_MAX_TYPES];
+};
+
+static LIST_HEAD(nvme_tcp_ctrl_list);
+static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
+static struct workqueue_struct *nvme_tcp_wq;
+static struct blk_mq_ops nvme_tcp_mq_ops;
+static struct blk_mq_ops nvme_tcp_admin_mq_ops;
+
+static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
+{
+ return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
+}
+
+static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
+{
+ return queue - queue->ctrl->queues;
+}
+
+static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
+{
+ u32 queue_idx = nvme_tcp_queue_id(queue);
+
+ if (queue_idx == 0)
+ return queue->ctrl->admin_tag_set.tags[queue_idx];
+ return queue->ctrl->tag_set.tags[queue_idx - 1];
+}
+
+static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
+{
+ return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
+static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
+{
+ return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
+static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
+{
+ return queue->cmnd_capsule_len - sizeof(struct nvme_command);
+}
+
+static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
+{
+ return req == &req->queue->ctrl->async_req;
+}
+
+static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
+{
+ struct request *rq;
+ unsigned int bytes;
+
+ if (unlikely(nvme_tcp_async_req(req)))
+ return false; /* async events don't have a request */
+
+ rq = blk_mq_rq_from_pdu(req);
+ bytes = blk_rq_payload_bytes(rq);
+
+ return rq_data_dir(rq) == WRITE && bytes &&
+ bytes <= nvme_tcp_inline_data_size(req->queue);
+}
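+
Whether a write goes inline depends only on the command capsule size negotiated for the queue; a user-space sketch assuming a hypothetical 8KB in-capsule data size:

#include <stdbool.h>
#include <stdio.h>

#define NVME_CMD_SIZE	64			/* sizeof(struct nvme_command) */
#define CAPSULE_LEN	(8192 + NVME_CMD_SIZE)	/* hypothetical cmnd_capsule_len */

static bool has_inline_data(bool is_write, unsigned int payload_bytes)
{
	return is_write && payload_bytes &&
	       payload_bytes <= CAPSULE_LEN - NVME_CMD_SIZE;
}

int main(void)
{
	printf("%d\n", has_inline_data(true, 4096));	/* 1: sent in the capsule */
	printf("%d\n", has_inline_data(true, 65536));	/* 0: needs H2C data PDUs */
	printf("%d\n", has_inline_data(false, 4096));	/* 0: reads never inline  */
	return 0;
}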
+
+static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
+{
+ return req->iter.bvec->bv_page;
+}
+
+static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
+{
+ return req->iter.bvec->bv_offset + req->iter.iov_offset;
+}
+
+static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
+{
+ return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
+ req->pdu_len - req->pdu_sent);
+}
+
+static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
+{
+ return req->iter.iov_offset;
+}
+
+static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
+{
+ return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
+ req->pdu_len - req->pdu_sent : 0;
+}
+
+static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
+ int len)
+{
+ return nvme_tcp_pdu_data_left(req) <= len;
+}
+
+static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
+ unsigned int dir)
+{
+ struct request *rq = blk_mq_rq_from_pdu(req);
+ struct bio_vec *vec;
+ unsigned int size;
+ int nsegs;
+ size_t offset;
+
+ if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ vec = &rq->special_vec;
+ nsegs = 1;
+ size = blk_rq_payload_bytes(rq);
+ offset = 0;
+ } else {
+ struct bio *bio = req->curr_bio;
+
+ vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+ nsegs = bio_segments(bio);
+ size = bio->bi_iter.bi_size;
+ offset = bio->bi_iter.bi_bvec_done;
+ }
+
+ iov_iter_bvec(&req->iter, dir, vec, nsegs, size);
+ req->iter.iov_offset = offset;
+}
+
+static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
+ int len)
+{
+ req->data_sent += len;
+ req->pdu_sent += len;
+ iov_iter_advance(&req->iter, len);
+ if (!iov_iter_count(&req->iter) &&
+ req->data_sent < req->data_len) {
+ req->curr_bio = req->curr_bio->bi_next;
+ nvme_tcp_init_iter(req, WRITE);
+ }
+}
+
+static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
+{
+ struct nvme_tcp_queue *queue = req->queue;
+
+ spin_lock(&queue->lock);
+ list_add_tail(&req->entry, &queue->send_list);
+ spin_unlock(&queue->lock);
+
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+}
+
+static inline struct nvme_tcp_request *
+nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_request *req;
+
+ spin_lock(&queue->lock);
+ req = list_first_entry_or_null(&queue->send_list,
+ struct nvme_tcp_request, entry);
+ if (req)
+ list_del(&req->entry);
+ spin_unlock(&queue->lock);
+
+ return req;
+}
+
+static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
+ __le32 *dgst)
+{
+ ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
+ crypto_ahash_final(hash);
+}
+
+static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
+ struct page *page, off_t off, size_t len)
+{
+ struct scatterlist sg;
+
+ sg_init_marker(&sg, 1);
+ sg_set_page(&sg, page, len, off);
+ ahash_request_set_crypt(hash, &sg, NULL, len);
+ crypto_ahash_update(hash);
+}
+
+static inline void nvme_tcp_hdgst(struct ahash_request *hash,
+ void *pdu, size_t len)
+{
+ struct scatterlist sg;
+
+ sg_init_one(&sg, pdu, len);
+ ahash_request_set_crypt(hash, &sg, pdu + len, len);
+ crypto_ahash_digest(hash);
+}
+
+static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
+ void *pdu, size_t pdu_len)
+{
+ struct nvme_tcp_hdr *hdr = pdu;
+ __le32 recv_digest;
+ __le32 exp_digest;
+
+ if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d: header digest flag is cleared\n",
+ nvme_tcp_queue_id(queue));
+ return -EPROTO;
+ }
+
+ recv_digest = *(__le32 *)(pdu + hdr->hlen);
+ nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
+ exp_digest = *(__le32 *)(pdu + hdr->hlen);
+ if (recv_digest != exp_digest) {
+ dev_err(queue->ctrl->ctrl.device,
+ "header digest error: recv %#x expected %#x\n",
+ le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
+{
+ struct nvme_tcp_hdr *hdr = pdu;
+ u8 digest_len = nvme_tcp_hdgst_len(queue);
+ u32 len;
+
+ len = le32_to_cpu(hdr->plen) - hdr->hlen -
+ ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
+
+ if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d: data digest flag is cleared\n",
+ nvme_tcp_queue_id(queue));
+ return -EPROTO;
+ }
+ crypto_ahash_init(queue->rcv_hash);
+
+ return 0;
+}
+
+static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
+ struct request *rq, unsigned int hctx_idx)
+{
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+
+ page_frag_free(req->pdu);
+}
+
+static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
+ struct request *rq, unsigned int hctx_idx,
+ unsigned int numa_node)
+{
+ struct nvme_tcp_ctrl *ctrl = set->driver_data;
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+ int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
+ struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
+ u8 hdgst = nvme_tcp_hdgst_len(queue);
+
+ req->pdu = page_frag_alloc(&queue->pf_cache,
+ sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!req->pdu)
+ return -ENOMEM;
+
+ req->queue = queue;
+ nvme_req(rq)->ctrl = &ctrl->ctrl;
+
+ return 0;
+}
+
+static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int hctx_idx)
+{
+ struct nvme_tcp_ctrl *ctrl = data;
+ struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
+
+ hctx->driver_data = queue;
+ return 0;
+}
+
+static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int hctx_idx)
+{
+ struct nvme_tcp_ctrl *ctrl = data;
+ struct nvme_tcp_queue *queue = &ctrl->queues[0];
+
+ hctx->driver_data = queue;
+ return 0;
+}
+
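+/*
+ * The receive state is derived from the outstanding byte counters: a partial
+ * PDU header takes precedence, then a pending data digest, otherwise we are
+ * in the middle of the data payload.
+ */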
+static enum nvme_tcp_recv_state
+nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
+{
+ return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
+ (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
+ NVME_TCP_RECV_DATA;
+}
+
+static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
+{
+ queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
+ nvme_tcp_hdgst_len(queue);
+ queue->pdu_offset = 0;
+ queue->data_remaining = -1;
+ queue->ddgst_remaining = 0;
+}
+
+static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
+{
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+ return;
+
+ queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work);
+}
+
+static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
+ struct nvme_completion *cqe)
+{
+ struct request *rq;
+
+ rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
+ if (!rq) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d tag 0x%x not found\n",
+ nvme_tcp_queue_id(queue), cqe->command_id);
+ nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+ return -EINVAL;
+ }
+
+ nvme_end_request(rq, cqe->status, cqe->result);
+ queue->nr_cqe++;
+
+ return 0;
+}
+
+static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
+ struct nvme_tcp_data_pdu *pdu)
+{
+ struct request *rq;
+
+ rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+ if (!rq) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d tag %#x not found\n",
+ nvme_tcp_queue_id(queue), pdu->command_id);
+ return -ENOENT;
+ }
+
+ if (!blk_rq_payload_bytes(rq)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d tag %#x unexpected data\n",
+ nvme_tcp_queue_id(queue), rq->tag);
+ return -EIO;
+ }
+
+ queue->data_remaining = le32_to_cpu(pdu->data_length);
+
+ if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
+ unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d tag %#x SUCCESS set but not last PDU\n",
+ nvme_tcp_queue_id(queue), rq->tag);
+ nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+ return -EPROTO;
+ }
+
+ return 0;
+}
+
+static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
+ struct nvme_tcp_rsp_pdu *pdu)
+{
+ struct nvme_completion *cqe = &pdu->cqe;
+ int ret = 0;
+
+ /*
+ * AEN requests are special as they don't time out and can
+ * survive any kind of queue freeze and often don't respond to
+ * aborts. We don't even bother to allocate a struct request
+ * for them but rather special case them here.
+ */
+ if (unlikely(nvme_tcp_queue_id(queue) == 0 &&
+ cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
+ nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
+ &cqe->result);
+ else
+ ret = nvme_tcp_process_nvme_cqe(queue, cqe);
+
+ return ret;
+}
+
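+/*
+ * Build the H2CData PDU header in response to an R2T, after validating that
+ * the solicited transfer fits within the request data and does not move
+ * backwards from what was already sent.
+ */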
+static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
+ struct nvme_tcp_r2t_pdu *pdu)
+{
+ struct nvme_tcp_data_pdu *data = req->pdu;
+ struct nvme_tcp_queue *queue = req->queue;
+ struct request *rq = blk_mq_rq_from_pdu(req);
+ u8 hdgst = nvme_tcp_hdgst_len(queue);
+ u8 ddgst = nvme_tcp_ddgst_len(queue);
+
+ req->pdu_len = le32_to_cpu(pdu->r2t_length);
+ req->pdu_sent = 0;
+
+ if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "req %d r2t len %u exceeded data len %u (%zu sent)\n",
+ rq->tag, req->pdu_len, req->data_len,
+ req->data_sent);
+ return -EPROTO;
+ }
+
+ if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "req %d unexpected r2t offset %u (expected %zu)\n",
+ rq->tag, le32_to_cpu(pdu->r2t_offset),
+ req->data_sent);
+ return -EPROTO;
+ }
+
+ memset(data, 0, sizeof(*data));
+ data->hdr.type = nvme_tcp_h2c_data;
+ data->hdr.flags = NVME_TCP_F_DATA_LAST;
+ if (queue->hdr_digest)
+ data->hdr.flags |= NVME_TCP_F_HDGST;
+ if (queue->data_digest)
+ data->hdr.flags |= NVME_TCP_F_DDGST;
+ data->hdr.hlen = sizeof(*data);
+ data->hdr.pdo = data->hdr.hlen + hdgst;
+ data->hdr.plen =
+ cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
+ data->ttag = pdu->ttag;
+ data->command_id = rq->tag;
+ data->data_offset = cpu_to_le32(req->data_sent);
+ data->data_length = cpu_to_le32(req->pdu_len);
+ return 0;
+}
+
+static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
+ struct nvme_tcp_r2t_pdu *pdu)
+{
+ struct nvme_tcp_request *req;
+ struct request *rq;
+ int ret;
+
+ rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+ if (!rq) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d tag %#x not found\n",
+ nvme_tcp_queue_id(queue), pdu->command_id);
+ return -ENOENT;
+ }
+ req = blk_mq_rq_to_pdu(rq);
+
+ ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
+ if (unlikely(ret))
+ return ret;
+
+ req->state = NVME_TCP_SEND_H2C_PDU;
+ req->offset = 0;
+
+ nvme_tcp_queue_request(req);
+
+ return 0;
+}
+
+static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
+ unsigned int *offset, size_t *len)
+{
+ struct nvme_tcp_hdr *hdr;
+ char *pdu = queue->pdu;
+ size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
+ int ret;
+
+ ret = skb_copy_bits(skb, *offset,
+ &pdu[queue->pdu_offset], rcv_len);
+ if (unlikely(ret))
+ return ret;
+
+ queue->pdu_remaining -= rcv_len;
+ queue->pdu_offset += rcv_len;
+ *offset += rcv_len;
+ *len -= rcv_len;
+ if (queue->pdu_remaining)
+ return 0;
+
+ hdr = queue->pdu;
+ if (queue->hdr_digest) {
+ ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ if (queue->data_digest) {
+ ret = nvme_tcp_check_ddgst(queue, queue->pdu);
+ if (unlikely(ret))
+ return ret;
+ }
+
+ switch (hdr->type) {
+ case nvme_tcp_c2h_data:
+ return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
+ case nvme_tcp_rsp:
+ nvme_tcp_init_recv_ctx(queue);
+ return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
+ case nvme_tcp_r2t:
+ nvme_tcp_init_recv_ctx(queue);
+ return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
+ default:
+ dev_err(queue->ctrl->ctrl.device,
+ "unsupported pdu type (%d)\n", hdr->type);
+ return -EINVAL;
+ }
+}
+
+static inline void nvme_tcp_end_request(struct request *rq, u16 status)
+{
+ union nvme_result res = {};
+
+ nvme_end_request(rq, cpu_to_le16(status << 1), res);
+}
+
+static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
+ unsigned int *offset, size_t *len)
+{
+ struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
+ struct nvme_tcp_request *req;
+ struct request *rq;
+
+ rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+ if (!rq) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d tag %#x not found\n",
+ nvme_tcp_queue_id(queue), pdu->command_id);
+ return -ENOENT;
+ }
+ req = blk_mq_rq_to_pdu(rq);
+
+ while (true) {
+ int recv_len, ret;
+
+ recv_len = min_t(size_t, *len, queue->data_remaining);
+ if (!recv_len)
+ break;
+
+ if (!iov_iter_count(&req->iter)) {
+ req->curr_bio = req->curr_bio->bi_next;
+
+ /*
+ * If we don't have any bios it means that the controller
+ * sent more data than we requested, hence error.
+ */
+ if (!req->curr_bio) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d no space in request %#x",
+ nvme_tcp_queue_id(queue), rq->tag);
+ nvme_tcp_init_recv_ctx(queue);
+ return -EIO;
+ }
+ nvme_tcp_init_iter(req, READ);
+ }
+
+ /* we can read only from what is left in this bio */
+ recv_len = min_t(size_t, recv_len,
+ iov_iter_count(&req->iter));
+
+ if (queue->data_digest)
+ ret = skb_copy_and_hash_datagram_iter(skb, *offset,
+ &req->iter, recv_len, queue->rcv_hash);
+ else
+ ret = skb_copy_datagram_iter(skb, *offset,
+ &req->iter, recv_len);
+ if (ret) {
+ dev_err(queue->ctrl->ctrl.device,
+ "queue %d failed to copy request %#x data",
+ nvme_tcp_queue_id(queue), rq->tag);
+ return ret;
+ }
+
+ *len -= recv_len;
+ *offset += recv_len;
+ queue->data_remaining -= recv_len;
+ }
+
+ if (!queue->data_remaining) {
+ if (queue->data_digest) {
+ nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
+ queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
+ } else {
+ if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
+ nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+ queue->nr_cqe++;
+ }
+ nvme_tcp_init_recv_ctx(queue);
+ }
+ }
+
+ return 0;
+}
+
+static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
+ struct sk_buff *skb, unsigned int *offset, size_t *len)
+{
+ struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
+ char *ddgst = (char *)&queue->recv_ddgst;
+ size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
+ off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
+ int ret;
+
+ ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
+ if (unlikely(ret))
+ return ret;
+
+ queue->ddgst_remaining -= recv_len;
+ *offset += recv_len;
+ *len -= recv_len;
+ if (queue->ddgst_remaining)
+ return 0;
+
+ if (queue->recv_ddgst != queue->exp_ddgst) {
+ dev_err(queue->ctrl->ctrl.device,
+ "data digest error: recv %#x expected %#x\n",
+ le32_to_cpu(queue->recv_ddgst),
+ le32_to_cpu(queue->exp_ddgst));
+ return -EIO;
+ }
+
+ if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
+ struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
+ pdu->command_id);
+
+ nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
+ queue->nr_cqe++;
+ }
+
+ nvme_tcp_init_recv_ctx(queue);
+ return 0;
+}
+
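+/*
+ * read_sock() callback: consume the skb according to the current receive
+ * state (PDU header, data payload or data digest); any failure disables
+ * further reads and kicks off controller error recovery.
+ */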
+static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
+{
+ struct nvme_tcp_queue *queue = desc->arg.data;
+ size_t consumed = len;
+ int result;
+
+ while (len) {
+ switch (nvme_tcp_recv_state(queue)) {
+ case NVME_TCP_RECV_PDU:
+ result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
+ break;
+ case NVME_TCP_RECV_DATA:
+ result = nvme_tcp_recv_data(queue, skb, &offset, &len);
+ break;
+ case NVME_TCP_RECV_DDGST:
+ result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
+ break;
+ default:
+ result = -EFAULT;
+ }
+ if (result) {
+ dev_err(queue->ctrl->ctrl.device,
+ "receive failed: %d\n", result);
+ queue->rd_enabled = false;
+ nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+ return result;
+ }
+ }
+
+ return consumed;
+}
+
+static void nvme_tcp_data_ready(struct sock *sk)
+{
+ struct nvme_tcp_queue *queue;
+
+ read_lock(&sk->sk_callback_lock);
+ queue = sk->sk_user_data;
+ if (likely(queue && queue->rd_enabled))
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+ read_unlock(&sk->sk_callback_lock);
+}
+
+static void nvme_tcp_write_space(struct sock *sk)
+{
+ struct nvme_tcp_queue *queue;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ queue = sk->sk_user_data;
+ if (likely(queue && sk_stream_is_writeable(sk))) {
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+ }
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void nvme_tcp_state_change(struct sock *sk)
+{
+ struct nvme_tcp_queue *queue;
+
+ read_lock(&sk->sk_callback_lock);
+ queue = sk->sk_user_data;
+ if (!queue)
+ goto done;
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ case TCP_CLOSE_WAIT:
+ case TCP_LAST_ACK:
+ case TCP_FIN_WAIT1:
+ case TCP_FIN_WAIT2:
+ nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+ break;
+ default:
+ dev_info(queue->ctrl->ctrl.device,
+ "queue %d socket state %d\n",
+ nvme_tcp_queue_id(queue), sk->sk_state);
+ }
+
+ queue->state_change(sk);
+done:
+ read_unlock(&sk->sk_callback_lock);
+}
+
+static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
+{
+ queue->request = NULL;
+}
+
+static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
+{
+ nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
+}
+
+static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
+{
+ struct nvme_tcp_queue *queue = req->queue;
+
+ while (true) {
+ struct page *page = nvme_tcp_req_cur_page(req);
+ size_t offset = nvme_tcp_req_cur_offset(req);
+ size_t len = nvme_tcp_req_cur_length(req);
+ bool last = nvme_tcp_pdu_last_send(req, len);
+ int ret, flags = MSG_DONTWAIT;
+
+ if (last && !queue->data_digest)
+ flags |= MSG_EOR;
+ else
+ flags |= MSG_MORE;
+
+ /* can't zcopy slab pages */
+ if (unlikely(PageSlab(page))) {
+ ret = sock_no_sendpage(queue->sock, page, offset, len,
+ flags);
+ } else {
+ ret = kernel_sendpage(queue->sock, page, offset, len,
+ flags);
+ }
+ if (ret <= 0)
+ return ret;
+
+ nvme_tcp_advance_req(req, ret);
+ if (queue->data_digest)
+ nvme_tcp_ddgst_update(queue->snd_hash, page,
+ offset, ret);
+
+ /* fully successful last write */
+ if (last && ret == len) {
+ if (queue->data_digest) {
+ nvme_tcp_ddgst_final(queue->snd_hash,
+ &req->ddgst);
+ req->state = NVME_TCP_SEND_DDGST;
+ req->offset = 0;
+ } else {
+ nvme_tcp_done_send_req(queue);
+ }
+ return 1;
+ }
+ }
+ return -EAGAIN;
+}
+
+static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
+{
+ struct nvme_tcp_queue *queue = req->queue;
+ struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ bool inline_data = nvme_tcp_has_inline_data(req);
+ int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
+ u8 hdgst = nvme_tcp_hdgst_len(queue);
+ int len = sizeof(*pdu) + hdgst - req->offset;
+ int ret;
+
+ if (queue->hdr_digest && !req->offset)
+ nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+
+ ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
+ offset_in_page(pdu) + req->offset, len, flags);
+ if (unlikely(ret <= 0))
+ return ret;
+
+ len -= ret;
+ if (!len) {
+ if (inline_data) {
+ req->state = NVME_TCP_SEND_DATA;
+ if (queue->data_digest)
+ crypto_ahash_init(queue->snd_hash);
+ nvme_tcp_init_iter(req, WRITE);
+ } else {
+ nvme_tcp_done_send_req(queue);
+ }
+ return 1;
+ }
+ req->offset += ret;
+
+ return -EAGAIN;
+}
+
+static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
+{
+ struct nvme_tcp_queue *queue = req->queue;
+ struct nvme_tcp_data_pdu *pdu = req->pdu;
+ u8 hdgst = nvme_tcp_hdgst_len(queue);
+ int len = sizeof(*pdu) - req->offset + hdgst;
+ int ret;
+
+ if (queue->hdr_digest && !req->offset)
+ nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+
+ ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
+ offset_in_page(pdu) + req->offset, len,
+ MSG_DONTWAIT | MSG_MORE);
+ if (unlikely(ret <= 0))
+ return ret;
+
+ len -= ret;
+ if (!len) {
+ req->state = NVME_TCP_SEND_DATA;
+ if (queue->data_digest)
+ crypto_ahash_init(queue->snd_hash);
+ if (!req->data_sent)
+ nvme_tcp_init_iter(req, WRITE);
+ return 1;
+ }
+ req->offset += ret;
+
+ return -EAGAIN;
+}
+
+static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
+{
+ struct nvme_tcp_queue *queue = req->queue;
+ int ret;
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
+ struct kvec iov = {
+ .iov_base = &req->ddgst + req->offset,
+ .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
+ };
+
+ ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+ if (unlikely(ret <= 0))
+ return ret;
+
+ if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
+ nvme_tcp_done_send_req(queue);
+ return 1;
+ }
+
+ req->offset += ret;
+ return -EAGAIN;
+}
+
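+/*
+ * Drive the per-request send state machine: command PDU, optional H2CData
+ * PDU, data payload and finally the data digest. Returns a positive value
+ * when progress was made, 0 when there is nothing to send or the socket
+ * buffer is full, and a negative error otherwise.
+ */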
+static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_request *req;
+ int ret = 1;
+
+ if (!queue->request) {
+ queue->request = nvme_tcp_fetch_request(queue);
+ if (!queue->request)
+ return 0;
+ }
+ req = queue->request;
+
+ if (req->state == NVME_TCP_SEND_CMD_PDU) {
+ ret = nvme_tcp_try_send_cmd_pdu(req);
+ if (ret <= 0)
+ goto done;
+ if (!nvme_tcp_has_inline_data(req))
+ return ret;
+ }
+
+ if (req->state == NVME_TCP_SEND_H2C_PDU) {
+ ret = nvme_tcp_try_send_data_pdu(req);
+ if (ret <= 0)
+ goto done;
+ }
+
+ if (req->state == NVME_TCP_SEND_DATA) {
+ ret = nvme_tcp_try_send_data(req);
+ if (ret <= 0)
+ goto done;
+ }
+
+ if (req->state == NVME_TCP_SEND_DDGST)
+ ret = nvme_tcp_try_send_ddgst(req);
+done:
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+}
+
+static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
+{
+ struct socket *sock = queue->sock;
+ struct sock *sk = sock->sk;
+ read_descriptor_t rd_desc;
+ int consumed;
+
+ rd_desc.arg.data = queue;
+ rd_desc.count = 1;
+ lock_sock(sk);
+ queue->nr_cqe = 0;
+ consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
+ release_sock(sk);
+ return consumed;
+}
+
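+/*
+ * Per-queue I/O context: alternate between sending and receiving until no
+ * more progress is made or a ~1ms quota expires, in which case the work
+ * requeues itself.
+ */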
+static void nvme_tcp_io_work(struct work_struct *w)
+{
+ struct nvme_tcp_queue *queue =
+ container_of(w, struct nvme_tcp_queue, io_work);
+ unsigned long deadline = jiffies + msecs_to_jiffies(1);
+
+ do {
+ bool pending = false;
+ int result;
+
+ result = nvme_tcp_try_send(queue);
+ if (result > 0) {
+ pending = true;
+ } else if (unlikely(result < 0)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "failed to send request %d\n", result);
+ if (result != -EPIPE)
+ nvme_tcp_fail_request(queue->request);
+ nvme_tcp_done_send_req(queue);
+ return;
+ }
+
+ result = nvme_tcp_try_recv(queue);
+ if (result > 0)
+ pending = true;
+
+ if (!pending)
+ return;
+
+ } while (!time_after(jiffies, deadline)); /* quota is exhausted */
+
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+}
+
+static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
+{
+ struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
+
+ ahash_request_free(queue->rcv_hash);
+ ahash_request_free(queue->snd_hash);
+ crypto_free_ahash(tfm);
+}
+
+static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
+{
+ struct crypto_ahash *tfm;
+
+ tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
+ if (!queue->snd_hash)
+ goto free_tfm;
+ ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
+
+ queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
+ if (!queue->rcv_hash)
+ goto free_snd_hash;
+ ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
+
+ return 0;
+free_snd_hash:
+ ahash_request_free(queue->snd_hash);
+free_tfm:
+ crypto_free_ahash(tfm);
+ return -ENOMEM;
+}
+
+static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
+{
+ struct nvme_tcp_request *async = &ctrl->async_req;
+
+ page_frag_free(async->pdu);
+}
+
+static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
+{
+ struct nvme_tcp_queue *queue = &ctrl->queues[0];
+ struct nvme_tcp_request *async = &ctrl->async_req;
+ u8 hdgst = nvme_tcp_hdgst_len(queue);
+
+ async->pdu = page_frag_alloc(&queue->pf_cache,
+ sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!async->pdu)
+ return -ENOMEM;
+
+ async->queue = &ctrl->queues[0];
+ return 0;
+}
+
+static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+
+ if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
+ return;
+
+ if (queue->hdr_digest || queue->data_digest)
+ nvme_tcp_free_crypto(queue);
+
+ sock_release(queue->sock);
+ kfree(queue->pdu);
+}
+
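+/*
+ * NVMe/TCP connection establishment: send an ICReq PDU and validate the
+ * ICResp (type, length, PFV, digest negotiation and CPDA) before the queue
+ * is used for command traffic.
+ */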
+static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_icreq_pdu *icreq;
+ struct nvme_tcp_icresp_pdu *icresp;
+ struct msghdr msg = {};
+ struct kvec iov;
+ bool ctrl_hdgst, ctrl_ddgst;
+ int ret;
+
+ icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
+ if (!icreq)
+ return -ENOMEM;
+
+ icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
+ if (!icresp) {
+ ret = -ENOMEM;
+ goto free_icreq;
+ }
+
+ icreq->hdr.type = nvme_tcp_icreq;
+ icreq->hdr.hlen = sizeof(*icreq);
+ icreq->hdr.pdo = 0;
+ icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
+ icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
+ icreq->maxr2t = 0; /* single inflight r2t supported */
+ icreq->hpda = 0; /* no alignment constraint */
+ if (queue->hdr_digest)
+ icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
+ if (queue->data_digest)
+ icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
+
+ iov.iov_base = icreq;
+ iov.iov_len = sizeof(*icreq);
+ ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+ if (ret < 0)
+ goto free_icresp;
+
+ memset(&msg, 0, sizeof(msg));
+ iov.iov_base = icresp;
+ iov.iov_len = sizeof(*icresp);
+ ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+ iov.iov_len, msg.msg_flags);
+ if (ret < 0)
+ goto free_icresp;
+
+ ret = -EINVAL;
+ if (icresp->hdr.type != nvme_tcp_icresp) {
+ pr_err("queue %d: bad type returned %d\n",
+ nvme_tcp_queue_id(queue), icresp->hdr.type);
+ goto free_icresp;
+ }
+
+ if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
+ pr_err("queue %d: bad pdu length returned %d\n",
+ nvme_tcp_queue_id(queue), icresp->hdr.plen);
+ goto free_icresp;
+ }
+
+ if (icresp->pfv != NVME_TCP_PFV_1_0) {
+ pr_err("queue %d: bad pfv returned %d\n",
+ nvme_tcp_queue_id(queue), icresp->pfv);
+ goto free_icresp;
+ }
+
+ ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
+ if ((queue->data_digest && !ctrl_ddgst) ||
+ (!queue->data_digest && ctrl_ddgst)) {
+ pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
+ nvme_tcp_queue_id(queue),
+ queue->data_digest ? "enabled" : "disabled",
+ ctrl_ddgst ? "enabled" : "disabled");
+ goto free_icresp;
+ }
+
+ ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
+ if ((queue->hdr_digest && !ctrl_hdgst) ||
+ (!queue->hdr_digest && ctrl_hdgst)) {
+ pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
+ nvme_tcp_queue_id(queue),
+ queue->hdr_digest ? "enabled" : "disabled",
+ ctrl_hdgst ? "enabled" : "disabled");
+ goto free_icresp;
+ }
+
+ if (icresp->cpda != 0) {
+ pr_err("queue %d: unsupported cpda returned %d\n",
+ nvme_tcp_queue_id(queue), icresp->cpda);
+ goto free_icresp;
+ }
+
+ ret = 0;
+free_icresp:
+ kfree(icresp);
+free_icreq:
+ kfree(icreq);
+ return ret;
+}
+
+static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
+ int qid, size_t queue_size)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+ struct linger sol = { .l_onoff = 1, .l_linger = 0 };
+ int ret, opt, rcv_pdu_size, n;
+
+ queue->ctrl = ctrl;
+ INIT_LIST_HEAD(&queue->send_list);
+ spin_lock_init(&queue->lock);
+ INIT_WORK(&queue->io_work, nvme_tcp_io_work);
+ queue->queue_size = queue_size;
+
+ if (qid > 0)
+ queue->cmnd_capsule_len = nctrl->ioccsz * 16;
+ else
+ queue->cmnd_capsule_len = sizeof(struct nvme_command) +
+ NVME_TCP_ADMIN_CCSZ;
+
+ ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
+ IPPROTO_TCP, &queue->sock);
+ if (ret) {
+ dev_err(nctrl->device,
+ "failed to create socket: %d\n", ret);
+ return ret;
+ }
+
+ /* Single syn retry */
+ opt = 1;
+ ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
+ (char *)&opt, sizeof(opt));
+ if (ret) {
+ dev_err(nctrl->device,
+ "failed to set TCP_SYNCNT sock opt %d\n", ret);
+ goto err_sock;
+ }
+
+ /* Set TCP no delay */
+ opt = 1;
+ ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
+ TCP_NODELAY, (char *)&opt, sizeof(opt));
+ if (ret) {
+ dev_err(nctrl->device,
+ "failed to set TCP_NODELAY sock opt %d\n", ret);
+ goto err_sock;
+ }
+
+ /*
+ * Cleanup whatever is sitting in the TCP transmit queue on socket
+ * close. This is done to prevent stale data from being sent should
+ * the network connection be restored before TCP times out.
+ */
+ ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
+ (char *)&sol, sizeof(sol));
+ if (ret) {
+ dev_err(nctrl->device,
+ "failed to set SO_LINGER sock opt %d\n", ret);
+ goto err_sock;
+ }
+
+ /* Set socket type of service */
+ if (nctrl->opts->tos >= 0) {
+ opt = nctrl->opts->tos;
+ ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS,
+ (char *)&opt, sizeof(opt));
+ if (ret) {
+ dev_err(nctrl->device,
+ "failed to set IP_TOS sock opt %d\n", ret);
+ goto err_sock;
+ }
+ }
+
+ queue->sock->sk->sk_allocation = GFP_ATOMIC;
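+ /* assign queues to online CPUs round-robin; the admin queue uses the first */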
+ if (!qid)
+ n = 0;
+ else
+ n = (qid - 1) % num_online_cpus();
+ queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+ queue->request = NULL;
+ queue->data_remaining = 0;
+ queue->ddgst_remaining = 0;
+ queue->pdu_remaining = 0;
+ queue->pdu_offset = 0;
+ sk_set_memalloc(queue->sock->sk);
+
+ if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
+ ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
+ sizeof(ctrl->src_addr));
+ if (ret) {
+ dev_err(nctrl->device,
+ "failed to bind queue %d socket %d\n",
+ qid, ret);
+ goto err_sock;
+ }
+ }
+
+ queue->hdr_digest = nctrl->opts->hdr_digest;
+ queue->data_digest = nctrl->opts->data_digest;
+ if (queue->hdr_digest || queue->data_digest) {
+ ret = nvme_tcp_alloc_crypto(queue);
+ if (ret) {
+ dev_err(nctrl->device,
+ "failed to allocate queue %d crypto\n", qid);
+ goto err_sock;
+ }
+ }
+
+ rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
+ nvme_tcp_hdgst_len(queue);
+ queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
+ if (!queue->pdu) {
+ ret = -ENOMEM;
+ goto err_crypto;
+ }
+
+ dev_dbg(nctrl->device, "connecting queue %d\n",
+ nvme_tcp_queue_id(queue));
+
+ ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
+ sizeof(ctrl->addr), 0);
+ if (ret) {
+ dev_err(nctrl->device,
+ "failed to connect socket: %d\n", ret);
+ goto err_rcv_pdu;
+ }
+
+ ret = nvme_tcp_init_connection(queue);
+ if (ret)
+ goto err_init_connect;
+
+ queue->rd_enabled = true;
+ set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
+ nvme_tcp_init_recv_ctx(queue);
+
+ write_lock_bh(&queue->sock->sk->sk_callback_lock);
+ queue->sock->sk->sk_user_data = queue;
+ queue->state_change = queue->sock->sk->sk_state_change;
+ queue->data_ready = queue->sock->sk->sk_data_ready;
+ queue->write_space = queue->sock->sk->sk_write_space;
+ queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
+ queue->sock->sk->sk_state_change = nvme_tcp_state_change;
+ queue->sock->sk->sk_write_space = nvme_tcp_write_space;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ queue->sock->sk->sk_ll_usec = 1;
+#endif
+ write_unlock_bh(&queue->sock->sk->sk_callback_lock);
+
+ return 0;
+
+err_init_connect:
+ kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+err_rcv_pdu:
+ kfree(queue->pdu);
+err_crypto:
+ if (queue->hdr_digest || queue->data_digest)
+ nvme_tcp_free_crypto(queue);
+err_sock:
+ sock_release(queue->sock);
+ queue->sock = NULL;
+ return ret;
+}
+
+static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
+{
+ struct socket *sock = queue->sock;
+
+ write_lock_bh(&sock->sk->sk_callback_lock);
+ sock->sk->sk_user_data = NULL;
+ sock->sk->sk_data_ready = queue->data_ready;
+ sock->sk->sk_state_change = queue->state_change;
+ sock->sk->sk_write_space = queue->write_space;
+ write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
+{
+ kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+ nvme_tcp_restore_sock_calls(queue);
+ cancel_work_sync(&queue->io_work);
+}
+
+static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+
+ if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
+ return;
+
+ __nvme_tcp_stop_queue(queue);
+}
+
+static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ int ret;
+
+ if (idx)
+ ret = nvmf_connect_io_queue(nctrl, idx, false);
+ else
+ ret = nvmf_connect_admin_queue(nctrl);
+
+ if (!ret) {
+ set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
+ } else {
+ if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
+ __nvme_tcp_stop_queue(&ctrl->queues[idx]);
+ dev_err(nctrl->device,
+ "failed to connect queue: %d ret=%d\n", idx, ret);
+ }
+ return ret;
+}
+
+static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
+ bool admin)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ struct blk_mq_tag_set *set;
+ int ret;
+
+ if (admin) {
+ set = &ctrl->admin_tag_set;
+ memset(set, 0, sizeof(*set));
+ set->ops = &nvme_tcp_admin_mq_ops;
+ set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+ set->reserved_tags = 2; /* connect + keep-alive */
+ set->numa_node = NUMA_NO_NODE;
+ set->cmd_size = sizeof(struct nvme_tcp_request);
+ set->driver_data = ctrl;
+ set->nr_hw_queues = 1;
+ set->timeout = ADMIN_TIMEOUT;
+ } else {
+ set = &ctrl->tag_set;
+ memset(set, 0, sizeof(*set));
+ set->ops = &nvme_tcp_mq_ops;
+ set->queue_depth = nctrl->sqsize + 1;
+ set->reserved_tags = 1; /* fabric connect */
+ set->numa_node = NUMA_NO_NODE;
+ set->flags = BLK_MQ_F_SHOULD_MERGE;
+ set->cmd_size = sizeof(struct nvme_tcp_request);
+ set->driver_data = ctrl;
+ set->nr_hw_queues = nctrl->queue_count - 1;
+ set->timeout = NVME_IO_TIMEOUT;
+ set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
+ }
+
+ ret = blk_mq_alloc_tag_set(set);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return set;
+}
+
+static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
+{
+ if (to_tcp_ctrl(ctrl)->async_req.pdu) {
+ nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
+ to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
+ }
+
+ nvme_tcp_free_queue(ctrl, 0);
+}
+
+static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
+{
+ int i;
+
+ for (i = 1; i < ctrl->queue_count; i++)
+ nvme_tcp_free_queue(ctrl, i);
+}
+
+static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
+{
+ int i;
+
+ for (i = 1; i < ctrl->queue_count; i++)
+ nvme_tcp_stop_queue(ctrl, i);
+}
+
+static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
+{
+ int i, ret = 0;
+
+ for (i = 1; i < ctrl->queue_count; i++) {
+ ret = nvme_tcp_start_queue(ctrl, i);
+ if (ret)
+ goto out_stop_queues;
+ }
+
+ return 0;
+
+out_stop_queues:
+ for (i--; i >= 1; i--)
+ nvme_tcp_stop_queue(ctrl, i);
+ return ret;
+}
+
+static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
+{
+ int ret;
+
+ ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
+ if (ret)
+ return ret;
+
+ ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
+ if (ret)
+ goto out_free_queue;
+
+ return 0;
+
+out_free_queue:
+ nvme_tcp_free_queue(ctrl, 0);
+ return ret;
+}
+
+static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
+{
+ int i, ret;
+
+ for (i = 1; i < ctrl->queue_count; i++) {
+ ret = nvme_tcp_alloc_queue(ctrl, i,
+ ctrl->sqsize + 1);
+ if (ret)
+ goto out_free_queues;
+ }
+
+ return 0;
+
+out_free_queues:
+ for (i--; i >= 1; i--)
+ nvme_tcp_free_queue(ctrl, i);
+
+ return ret;
+}
+
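+/*
+ * Ask for enough I/O queues to cover the default, write and poll queue sets,
+ * each capped at the number of online CPUs.
+ */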
+static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
+{
+ unsigned int nr_io_queues;
+
+ nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
+ nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
+ nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
+
+ return nr_io_queues;
+}
+
+static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
+ unsigned int nr_io_queues)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+ struct nvmf_ctrl_options *opts = nctrl->opts;
+
+ if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
+ /*
+ * separate read/write queues
+ * hand out dedicated default queues only after we have
+ * sufficient read queues.
+ */
+ ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+ min(opts->nr_write_queues, nr_io_queues);
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ } else {
+ /*
+ * shared read/write queues
+ * either no write queues were requested, or we don't have
+ * sufficient queue count to have dedicated default queues.
+ */
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+ min(opts->nr_io_queues, nr_io_queues);
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ }
+
+ if (opts->nr_poll_queues && nr_io_queues) {
+ /* map dedicated poll queues only if we have queues left */
+ ctrl->io_queues[HCTX_TYPE_POLL] =
+ min(opts->nr_poll_queues, nr_io_queues);
+ }
+}
+
+static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
+{
+ unsigned int nr_io_queues;
+ int ret;
+
+ nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
+ ret = nvme_set_queue_count(ctrl, &nr_io_queues);
+ if (ret)
+ return ret;
+
+ ctrl->queue_count = nr_io_queues + 1;
+ if (ctrl->queue_count < 2)
+ return 0;
+
+ dev_info(ctrl->device,
+ "creating %d I/O queues.\n", nr_io_queues);
+
+ nvme_tcp_set_io_queues(ctrl, nr_io_queues);
+
+ return __nvme_tcp_alloc_io_queues(ctrl);
+}
+
+static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
+{
+ nvme_tcp_stop_io_queues(ctrl);
+ if (remove) {
+ blk_cleanup_queue(ctrl->connect_q);
+ blk_mq_free_tag_set(ctrl->tagset);
+ }
+ nvme_tcp_free_io_queues(ctrl);
+}
+
+static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
+{
+ int ret;
+
+ ret = nvme_tcp_alloc_io_queues(ctrl);
+ if (ret)
+ return ret;
+
+ if (new) {
+ ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
+ if (IS_ERR(ctrl->tagset)) {
+ ret = PTR_ERR(ctrl->tagset);
+ goto out_free_io_queues;
+ }
+
+ ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
+ if (IS_ERR(ctrl->connect_q)) {
+ ret = PTR_ERR(ctrl->connect_q);
+ goto out_free_tag_set;
+ }
+ } else {
+ blk_mq_update_nr_hw_queues(ctrl->tagset,
+ ctrl->queue_count - 1);
+ }
+
+ ret = nvme_tcp_start_io_queues(ctrl);
+ if (ret)
+ goto out_cleanup_connect_q;
+
+ return 0;
+
+out_cleanup_connect_q:
+ if (new)
+ blk_cleanup_queue(ctrl->connect_q);
+out_free_tag_set:
+ if (new)
+ blk_mq_free_tag_set(ctrl->tagset);
+out_free_io_queues:
+ nvme_tcp_free_io_queues(ctrl);
+ return ret;
+}
+
+static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
+{
+ nvme_tcp_stop_queue(ctrl, 0);
+ if (remove) {
+ blk_cleanup_queue(ctrl->admin_q);
+ blk_cleanup_queue(ctrl->fabrics_q);
+ blk_mq_free_tag_set(ctrl->admin_tagset);
+ }
+ nvme_tcp_free_admin_queue(ctrl);
+}
+
+static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
+{
+ int error;
+
+ error = nvme_tcp_alloc_admin_queue(ctrl);
+ if (error)
+ return error;
+
+ if (new) {
+ ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
+ if (IS_ERR(ctrl->admin_tagset)) {
+ error = PTR_ERR(ctrl->admin_tagset);
+ goto out_free_queue;
+ }
+
+ ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
+ if (IS_ERR(ctrl->fabrics_q)) {
+ error = PTR_ERR(ctrl->fabrics_q);
+ goto out_free_tagset;
+ }
+
+ ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
+ if (IS_ERR(ctrl->admin_q)) {
+ error = PTR_ERR(ctrl->admin_q);
+ goto out_cleanup_fabrics_q;
+ }
+ }
+
+ error = nvme_tcp_start_queue(ctrl, 0);
+ if (error)
+ goto out_cleanup_queue;
+
+ error = nvme_enable_ctrl(ctrl);
+ if (error)
+ goto out_stop_queue;
+
+ blk_mq_unquiesce_queue(ctrl->admin_q);
+
+ error = nvme_init_identify(ctrl);
+ if (error)
+ goto out_stop_queue;
+
+ return 0;
+
+out_stop_queue:
+ nvme_tcp_stop_queue(ctrl, 0);
+out_cleanup_queue:
+ if (new)
+ blk_cleanup_queue(ctrl->admin_q);
+out_cleanup_fabrics_q:
+ if (new)
+ blk_cleanup_queue(ctrl->fabrics_q);
+out_free_tagset:
+ if (new)
+ blk_mq_free_tag_set(ctrl->admin_tagset);
+out_free_queue:
+ nvme_tcp_free_admin_queue(ctrl);
+ return error;
+}
+
+static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
+ bool remove)
+{
+ blk_mq_quiesce_queue(ctrl->admin_q);
+ nvme_tcp_stop_queue(ctrl, 0);
+ if (ctrl->admin_tagset) {
+ blk_mq_tagset_busy_iter(ctrl->admin_tagset,
+ nvme_cancel_request, ctrl);
+ blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
+ }
+ if (remove)
+ blk_mq_unquiesce_queue(ctrl->admin_q);
+ nvme_tcp_destroy_admin_queue(ctrl, remove);
+}
+
+static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
+ bool remove)
+{
+ if (ctrl->queue_count <= 1)
+ return;
+ nvme_stop_queues(ctrl);
+ nvme_tcp_stop_io_queues(ctrl);
+ if (ctrl->tagset) {
+ blk_mq_tagset_busy_iter(ctrl->tagset,
+ nvme_cancel_request, ctrl);
+ blk_mq_tagset_wait_completed_request(ctrl->tagset);
+ }
+ if (remove)
+ nvme_start_queues(ctrl);
+ nvme_tcp_destroy_io_queues(ctrl, remove);
+}
+
+static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
+{
+ /* If we are resetting/deleting then do nothing */
+ if (ctrl->state != NVME_CTRL_CONNECTING) {
+ WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
+ ctrl->state == NVME_CTRL_LIVE);
+ return;
+ }
+
+ if (nvmf_should_reconnect(ctrl)) {
+ dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
+ ctrl->opts->reconnect_delay);
+ queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
+ ctrl->opts->reconnect_delay * HZ);
+ } else {
+ dev_info(ctrl->device, "Removing controller...\n");
+ nvme_delete_ctrl(ctrl);
+ }
+}
+
+static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
+{
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+ int ret;
+
+ ret = nvme_tcp_configure_admin_queue(ctrl, new);
+ if (ret)
+ return ret;
+
+ if (ctrl->icdoff) {
+ ret = -EOPNOTSUPP;
+ dev_err(ctrl->device, "icdoff is not supported!\n");
+ goto destroy_admin;
+ }
+
+ if (opts->queue_size > ctrl->sqsize + 1)
+ dev_warn(ctrl->device,
+ "queue_size %zu > ctrl sqsize %u, clamping down\n",
+ opts->queue_size, ctrl->sqsize + 1);
+
+ if (ctrl->sqsize + 1 > ctrl->maxcmd) {
+ dev_warn(ctrl->device,
+ "sqsize %u > ctrl maxcmd %u, clamping down\n",
+ ctrl->sqsize + 1, ctrl->maxcmd);
+ ctrl->sqsize = ctrl->maxcmd - 1;
+ }
+
+ if (ctrl->queue_count > 1) {
+ ret = nvme_tcp_configure_io_queues(ctrl, new);
+ if (ret)
+ goto destroy_admin;
+ }
+
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
+ /* state change failure is ok if we're in DELETING state */
+ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+ ret = -EINVAL;
+ goto destroy_io;
+ }
+
+ nvme_start_ctrl(ctrl);
+ return 0;
+
+destroy_io:
+ if (ctrl->queue_count > 1)
+ nvme_tcp_destroy_io_queues(ctrl, new);
+destroy_admin:
+ nvme_tcp_stop_queue(ctrl, 0);
+ nvme_tcp_destroy_admin_queue(ctrl, new);
+ return ret;
+}
+
+static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
+{
+ struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
+ struct nvme_tcp_ctrl, connect_work);
+ struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+
+ ++ctrl->nr_reconnects;
+
+ if (nvme_tcp_setup_ctrl(ctrl, false))
+ goto requeue;
+
+ dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
+ ctrl->nr_reconnects);
+
+ ctrl->nr_reconnects = 0;
+
+ return;
+
+requeue:
+ dev_info(ctrl->device, "Failed reconnect attempt %d\n",
+ ctrl->nr_reconnects);
+ nvme_tcp_reconnect_or_remove(ctrl);
+}
+
+static void nvme_tcp_error_recovery_work(struct work_struct *work)
+{
+ struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
+ struct nvme_tcp_ctrl, err_work);
+ struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+
+ nvme_stop_keep_alive(ctrl);
+ nvme_tcp_teardown_io_queues(ctrl, false);
+ /* unquiesce to fast-fail pending requests */
+ nvme_start_queues(ctrl);
+ nvme_tcp_teardown_admin_queue(ctrl, false);
+ blk_mq_unquiesce_queue(ctrl->admin_q);
+
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
+ /* state change failure is ok if we're in DELETING state */
+ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+ return;
+ }
+
+ nvme_tcp_reconnect_or_remove(ctrl);
+}
+
+static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
+{
+ cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
+ cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
+
+ nvme_tcp_teardown_io_queues(ctrl, shutdown);
+ blk_mq_quiesce_queue(ctrl->admin_q);
+ if (shutdown)
+ nvme_shutdown_ctrl(ctrl);
+ else
+ nvme_disable_ctrl(ctrl);
+ nvme_tcp_teardown_admin_queue(ctrl, shutdown);
+}
+
+static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
+{
+ nvme_tcp_teardown_ctrl(ctrl, true);
+}
+
+static void nvme_reset_ctrl_work(struct work_struct *work)
+{
+ struct nvme_ctrl *ctrl =
+ container_of(work, struct nvme_ctrl, reset_work);
+
+ nvme_stop_ctrl(ctrl);
+ nvme_tcp_teardown_ctrl(ctrl, false);
+
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
+ /* state change failure is ok if we're in DELETING state */
+ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+ return;
+ }
+
+ if (nvme_tcp_setup_ctrl(ctrl, false))
+ goto out_fail;
+
+ return;
+
+out_fail:
+ ++ctrl->nr_reconnects;
+ nvme_tcp_reconnect_or_remove(ctrl);
+}
+
+static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+
+ if (list_empty(&ctrl->list))
+ goto free_ctrl;
+
+ mutex_lock(&nvme_tcp_ctrl_mutex);
+ list_del(&ctrl->list);
+ mutex_unlock(&nvme_tcp_ctrl_mutex);
+
+ nvmf_free_options(nctrl->opts);
+free_ctrl:
+ kfree(ctrl->queues);
+ kfree(ctrl);
+}
+
+static void nvme_tcp_set_sg_null(struct nvme_command *c)
+{
+ struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+
+ sg->addr = 0;
+ sg->length = 0;
+ sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
+ NVME_SGL_FMT_TRANSPORT_A;
+}
+
+static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
+ struct nvme_command *c, u32 data_len)
+{
+ struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+
+ sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
+ sg->length = cpu_to_le32(data_len);
+ sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
+}
+
+static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
+ u32 data_len)
+{
+ struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+
+ sg->addr = 0;
+ sg->length = cpu_to_le32(data_len);
+ sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
+ NVME_SGL_FMT_TRANSPORT_A;
+}
+
+static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
+{
+ struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
+ struct nvme_tcp_queue *queue = &ctrl->queues[0];
+ struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
+ struct nvme_command *cmd = &pdu->cmd;
+ u8 hdgst = nvme_tcp_hdgst_len(queue);
+
+ memset(pdu, 0, sizeof(*pdu));
+ pdu->hdr.type = nvme_tcp_cmd;
+ if (queue->hdr_digest)
+ pdu->hdr.flags |= NVME_TCP_F_HDGST;
+ pdu->hdr.hlen = sizeof(*pdu);
+ pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
+
+ cmd->common.opcode = nvme_admin_async_event;
+ cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
+ cmd->common.flags |= NVME_CMD_SGL_METABUF;
+ nvme_tcp_set_sg_null(cmd);
+
+ ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
+ ctrl->async_req.offset = 0;
+ ctrl->async_req.curr_bio = NULL;
+ ctrl->async_req.data_len = 0;
+
+ nvme_tcp_queue_request(&ctrl->async_req);
+}
+
+static enum blk_eh_timer_return
+nvme_tcp_timeout(struct request *rq, bool reserved)
+{
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
+ struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+
+ /*
+ * Restart the timer if a controller reset is already scheduled. Any
+ * timed out commands would be handled before entering the connecting
+ * state.
+ */
+ if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
+ return BLK_EH_RESET_TIMER;
+
+ dev_warn(ctrl->ctrl.device,
+ "queue %d: timeout request %#x type %d\n",
+ nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
+
+ if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
+ /*
+ * Tear down immediately if the controller times out while starting
+ * or if error recovery has already started. All outstanding
+ * requests are completed on shutdown, so we return BLK_EH_DONE.
+ */
+ flush_work(&ctrl->err_work);
+ nvme_tcp_teardown_io_queues(&ctrl->ctrl, false);
+ nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false);
+ return BLK_EH_DONE;
+ }
+
+ dev_warn(ctrl->ctrl.device, "starting error recovery\n");
+ nvme_tcp_error_recovery(&ctrl->ctrl);
+
+ return BLK_EH_RESET_TIMER;
+}
+
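+/*
+ * Set up the command SGL: writes that fit the in-capsule data size use an
+ * offset SGL (data sent inline with the command capsule), everything else
+ * uses a transport SGL describing data moved in separate data PDUs.
+ */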
+static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
+ struct request *rq)
+{
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ struct nvme_command *c = &pdu->cmd;
+
+ c->common.flags |= NVME_CMD_SGL_METABUF;
+
+ if (rq_data_dir(rq) == WRITE && req->data_len &&
+ req->data_len <= nvme_tcp_inline_data_size(queue))
+ nvme_tcp_set_sg_inline(queue, c, req->data_len);
+ else
+ nvme_tcp_set_sg_host_data(c, req->data_len);
+
+ return 0;
+}
+
+static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
+ struct request *rq)
+{
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+ struct nvme_tcp_queue *queue = req->queue;
+ u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
+ blk_status_t ret;
+
+ ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
+ if (ret)
+ return ret;
+
+ req->state = NVME_TCP_SEND_CMD_PDU;
+ req->offset = 0;
+ req->data_sent = 0;
+ req->pdu_len = 0;
+ req->pdu_sent = 0;
+ req->data_len = blk_rq_payload_bytes(rq);
+ req->curr_bio = rq->bio;
+
+ if (rq_data_dir(rq) == WRITE &&
+ req->data_len <= nvme_tcp_inline_data_size(queue))
+ req->pdu_len = req->data_len;
+ else if (req->curr_bio)
+ nvme_tcp_init_iter(req, READ);
+
+ pdu->hdr.type = nvme_tcp_cmd;
+ pdu->hdr.flags = 0;
+ if (queue->hdr_digest)
+ pdu->hdr.flags |= NVME_TCP_F_HDGST;
+ if (queue->data_digest && req->pdu_len) {
+ pdu->hdr.flags |= NVME_TCP_F_DDGST;
+ ddgst = nvme_tcp_ddgst_len(queue);
+ }
+ pdu->hdr.hlen = sizeof(*pdu);
+ pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
+ pdu->hdr.plen =
+ cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
+
+ ret = nvme_tcp_map_data(queue, rq);
+ if (unlikely(ret)) {
+ nvme_cleanup_cmd(rq);
+ dev_err(queue->ctrl->ctrl.device,
+ "Failed to map data (%d)\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct nvme_ns *ns = hctx->queue->queuedata;
+ struct nvme_tcp_queue *queue = hctx->driver_data;
+ struct request *rq = bd->rq;
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+ bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
+ blk_status_t ret;
+
+ if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
+ return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
+
+ ret = nvme_tcp_setup_cmd_pdu(ns, rq);
+ if (unlikely(ret))
+ return ret;
+
+ blk_mq_start_request(rq);
+
+ nvme_tcp_queue_request(req);
+
+ return BLK_STS_OK;
+}
+
+static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
+{
+ struct nvme_tcp_ctrl *ctrl = set->driver_data;
+ struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
+
+ if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
+ /* separate read/write queues */
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+ set->map[HCTX_TYPE_READ].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_READ];
+ set->map[HCTX_TYPE_READ].queue_offset =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ } else {
+ /* shared read/write queues */
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+ set->map[HCTX_TYPE_READ].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_READ].queue_offset = 0;
+ }
+ blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
+ blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
+
+ if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
+ /* map dedicated poll queues only if we have queues left */
+ set->map[HCTX_TYPE_POLL].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_POLL];
+ set->map[HCTX_TYPE_POLL].queue_offset =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+ ctrl->io_queues[HCTX_TYPE_READ];
+ blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
+ }
+
+ dev_info(ctrl->ctrl.device,
+ "mapped %d/%d/%d default/read/poll queues.\n",
+ ctrl->io_queues[HCTX_TYPE_DEFAULT],
+ ctrl->io_queues[HCTX_TYPE_READ],
+ ctrl->io_queues[HCTX_TYPE_POLL]);
+
+ return 0;
+}
+
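+/* blk-mq poll: busy-poll the socket when possible, then reap completions */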
+static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
+{
+ struct nvme_tcp_queue *queue = hctx->driver_data;
+ struct sock *sk = queue->sock->sk;
+
+ if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
+ sk_busy_loop(sk, true);
+ nvme_tcp_try_recv(queue);
+ return queue->nr_cqe;
+}
+
+static struct blk_mq_ops nvme_tcp_mq_ops = {
+ .queue_rq = nvme_tcp_queue_rq,
+ .complete = nvme_complete_rq,
+ .init_request = nvme_tcp_init_request,
+ .exit_request = nvme_tcp_exit_request,
+ .init_hctx = nvme_tcp_init_hctx,
+ .timeout = nvme_tcp_timeout,
+ .map_queues = nvme_tcp_map_queues,
+ .poll = nvme_tcp_poll,
+};
+
+static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
+ .queue_rq = nvme_tcp_queue_rq,
+ .complete = nvme_complete_rq,
+ .init_request = nvme_tcp_init_request,
+ .exit_request = nvme_tcp_exit_request,
+ .init_hctx = nvme_tcp_init_admin_hctx,
+ .timeout = nvme_tcp_timeout,
+};
+
+static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
+ .name = "tcp",
+ .module = THIS_MODULE,
+ .flags = NVME_F_FABRICS,
+ .reg_read32 = nvmf_reg_read32,
+ .reg_read64 = nvmf_reg_read64,
+ .reg_write32 = nvmf_reg_write32,
+ .free_ctrl = nvme_tcp_free_ctrl,
+ .submit_async_event = nvme_tcp_submit_async_event,
+ .delete_ctrl = nvme_tcp_delete_ctrl,
+ .get_address = nvmf_get_address,
+};
+
+static bool
+nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
+{
+ struct nvme_tcp_ctrl *ctrl;
+ bool found = false;
+
+ mutex_lock(&nvme_tcp_ctrl_mutex);
+ list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
+ found = nvmf_ip_options_match(&ctrl->ctrl, opts);
+ if (found)
+ break;
+ }
+ mutex_unlock(&nvme_tcp_ctrl_mutex);
+
+ return found;
+}
+
+static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
+ struct nvmf_ctrl_options *opts)
+{
+ struct nvme_tcp_ctrl *ctrl;
+ int ret;
+
+ ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
+ if (!ctrl)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&ctrl->list);
+ ctrl->ctrl.opts = opts;
+ ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
+ opts->nr_poll_queues + 1;
+ ctrl->ctrl.sqsize = opts->queue_size - 1;
+ ctrl->ctrl.kato = opts->kato;
+
+ INIT_DELAYED_WORK(&ctrl->connect_work,
+ nvme_tcp_reconnect_ctrl_work);
+ INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
+ INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
+
+ if (!(opts->mask & NVMF_OPT_TRSVCID)) {
+ opts->trsvcid =
+ kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
+ if (!opts->trsvcid) {
+ ret = -ENOMEM;
+ goto out_free_ctrl;
+ }
+ opts->mask |= NVMF_OPT_TRSVCID;
+ }
+
+ ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+ opts->traddr, opts->trsvcid, &ctrl->addr);
+ if (ret) {
+ pr_err("malformed address passed: %s:%s\n",
+ opts->traddr, opts->trsvcid);
+ goto out_free_ctrl;
+ }
+
+ if (opts->mask & NVMF_OPT_HOST_TRADDR) {
+ ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+ opts->host_traddr, NULL, &ctrl->src_addr);
+ if (ret) {
+ pr_err("malformed src address passed: %s\n",
+ opts->host_traddr);
+ goto out_free_ctrl;
+ }
+ }
+
+ if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
+ ret = -EALREADY;
+ goto out_free_ctrl;
+ }
+
+ ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
+ GFP_KERNEL);
+ if (!ctrl->queues) {
+ ret = -ENOMEM;
+ goto out_free_ctrl;
+ }
+
+ ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
+ if (ret)
+ goto out_kfree_queues;
+
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
+ WARN_ON_ONCE(1);
+ ret = -EINTR;
+ goto out_uninit_ctrl;
+ }
+
+ ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
+ if (ret)
+ goto out_uninit_ctrl;
+
+ dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
+ ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
+
+ nvme_get_ctrl(&ctrl->ctrl);
+
+ mutex_lock(&nvme_tcp_ctrl_mutex);
+ list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
+ mutex_unlock(&nvme_tcp_ctrl_mutex);
+
+ return &ctrl->ctrl;
+
+out_uninit_ctrl:
+ nvme_uninit_ctrl(&ctrl->ctrl);
+ nvme_put_ctrl(&ctrl->ctrl);
+ if (ret > 0)
+ ret = -EIO;
+ return ERR_PTR(ret);
+out_kfree_queues:
+ kfree(ctrl->queues);
+out_free_ctrl:
+ kfree(ctrl);
+ return ERR_PTR(ret);
+}
+
+static struct nvmf_transport_ops nvme_tcp_transport = {
+ .name = "tcp",
+ .module = THIS_MODULE,
+ .required_opts = NVMF_OPT_TRADDR,
+ .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
+ NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
+ NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
+ NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
+ NVMF_OPT_TOS,
+ .create_ctrl = nvme_tcp_create_ctrl,
+};
+
+static int __init nvme_tcp_init_module(void)
+{
+ nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+ if (!nvme_tcp_wq)
+ return -ENOMEM;
+
+ nvmf_register_transport(&nvme_tcp_transport);
+ return 0;
+}
+
+static void __exit nvme_tcp_cleanup_module(void)
+{
+ struct nvme_tcp_ctrl *ctrl;
+
+ nvmf_unregister_transport(&nvme_tcp_transport);
+
+ mutex_lock(&nvme_tcp_ctrl_mutex);
+ list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
+ nvme_delete_ctrl(&ctrl->ctrl);
+ mutex_unlock(&nvme_tcp_ctrl_mutex);
+ flush_workqueue(nvme_delete_wq);
+
+ destroy_workqueue(nvme_tcp_wq);
+}
+
+module_init(nvme_tcp_init_module);
+module_exit(nvme_tcp_cleanup_module);
+
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
index 25b0e31..5c3cb69 100644
--- a/drivers/nvme/host/trace.c
+++ b/drivers/nvme/host/trace.c
@@ -1,20 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* NVM Express device driver tracepoints
* Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
*/
#include <asm/unaligned.h>
#include "trace.h"
+static const char *nvme_trace_delete_sq(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u16 sqid = get_unaligned_le16(cdw10);
+
+ trace_seq_printf(p, "sqid=%u", sqid);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
@@ -31,6 +34,17 @@
return ret;
}
+static const char *nvme_trace_delete_cq(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u16 cqid = get_unaligned_le16(cdw10);
+
+ trace_seq_printf(p, "cqid=%u", cqid);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10)
{
const char *ret = trace_seq_buffer_ptr(p);
@@ -58,7 +72,35 @@
return ret;
}
+static const char *nvme_trace_admin_get_features(struct trace_seq *p,
+ u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 fid = cdw10[0];
+ u8 sel = cdw10[1] & 0x7;
+ u32 cdw11 = get_unaligned_le32(cdw10 + 4);
+
+ trace_seq_printf(p, "fid=0x%x sel=0x%x cdw11=0x%x", fid, sel, cdw11);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvme_trace_get_lba_status(struct trace_seq *p,
+ u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u64 slba = get_unaligned_le64(cdw10);
+ u32 mndw = get_unaligned_le32(cdw10 + 8);
+ u16 rl = get_unaligned_le16(cdw10 + 12);
+ u8 atype = cdw10[15];
+
+ trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u",
+ slba, mndw, rl, atype);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
{
@@ -103,12 +145,20 @@
u8 opcode, u8 *cdw10)
{
switch (opcode) {
+ case nvme_admin_delete_sq:
+ return nvme_trace_delete_sq(p, cdw10);
case nvme_admin_create_sq:
return nvme_trace_create_sq(p, cdw10);
+ case nvme_admin_delete_cq:
+ return nvme_trace_delete_cq(p, cdw10);
case nvme_admin_create_cq:
return nvme_trace_create_cq(p, cdw10);
case nvme_admin_identify:
return nvme_trace_admin_identify(p, cdw10);
+ case nvme_admin_get_features:
+ return nvme_trace_admin_get_features(p, cdw10);
+ case nvme_admin_get_lba_status:
+ return nvme_trace_get_lba_status(p, cdw10);
default:
return nvme_trace_common(p, cdw10);
}
@@ -129,6 +179,69 @@
}
}
+static const char *nvme_trace_fabrics_property_set(struct trace_seq *p, u8 *spc)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 attrib = spc[0];
+ u32 ofst = get_unaligned_le32(spc + 4);
+ u64 value = get_unaligned_le64(spc + 8);
+
+ trace_seq_printf(p, "attrib=%u, ofst=0x%x, value=0x%llx",
+ attrib, ofst, value);
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+static const char *nvme_trace_fabrics_connect(struct trace_seq *p, u8 *spc)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u16 recfmt = get_unaligned_le16(spc);
+ u16 qid = get_unaligned_le16(spc + 2);
+ u16 sqsize = get_unaligned_le16(spc + 4);
+ u8 cattr = spc[6];
+ u32 kato = get_unaligned_le32(spc + 8);
+
+ trace_seq_printf(p, "recfmt=%u, qid=%u, sqsize=%u, cattr=%u, kato=%u",
+ recfmt, qid, sqsize, cattr, kato);
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+static const char *nvme_trace_fabrics_property_get(struct trace_seq *p, u8 *spc)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 attrib = spc[0];
+ u32 ofst = get_unaligned_le32(spc + 4);
+
+ trace_seq_printf(p, "attrib=%u, ofst=0x%x", attrib, ofst);
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ trace_seq_printf(p, "specific=%*ph", 24, spc);
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p,
+ u8 fctype, u8 *spc)
+{
+ switch (fctype) {
+ case nvme_fabrics_type_property_set:
+ return nvme_trace_fabrics_property_set(p, spc);
+ case nvme_fabrics_type_connect:
+ return nvme_trace_fabrics_connect(p, spc);
+ case nvme_fabrics_type_property_get:
+ return nvme_trace_fabrics_property_get(p, spc);
+ default:
+ return nvme_trace_fabrics_common(p, spc);
+ }
+}
+
const char *nvme_trace_disk_name(struct trace_seq *p, char *name)
{
const char *ret = trace_seq_buffer_ptr(p);
@@ -139,3 +252,5 @@
return ret;
}
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(nvme_sq);
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index a490790..daaf700 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -1,15 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/*
* NVM Express device driver tracepoints
* Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
*/
#undef TRACE_SYSTEM
@@ -24,59 +16,19 @@
#include "nvme.h"
-#define nvme_admin_opcode_name(opcode) { opcode, #opcode }
-#define show_admin_opcode_name(val) \
- __print_symbolic(val, \
- nvme_admin_opcode_name(nvme_admin_delete_sq), \
- nvme_admin_opcode_name(nvme_admin_create_sq), \
- nvme_admin_opcode_name(nvme_admin_get_log_page), \
- nvme_admin_opcode_name(nvme_admin_delete_cq), \
- nvme_admin_opcode_name(nvme_admin_create_cq), \
- nvme_admin_opcode_name(nvme_admin_identify), \
- nvme_admin_opcode_name(nvme_admin_abort_cmd), \
- nvme_admin_opcode_name(nvme_admin_set_features), \
- nvme_admin_opcode_name(nvme_admin_get_features), \
- nvme_admin_opcode_name(nvme_admin_async_event), \
- nvme_admin_opcode_name(nvme_admin_ns_mgmt), \
- nvme_admin_opcode_name(nvme_admin_activate_fw), \
- nvme_admin_opcode_name(nvme_admin_download_fw), \
- nvme_admin_opcode_name(nvme_admin_ns_attach), \
- nvme_admin_opcode_name(nvme_admin_keep_alive), \
- nvme_admin_opcode_name(nvme_admin_directive_send), \
- nvme_admin_opcode_name(nvme_admin_directive_recv), \
- nvme_admin_opcode_name(nvme_admin_dbbuf), \
- nvme_admin_opcode_name(nvme_admin_format_nvm), \
- nvme_admin_opcode_name(nvme_admin_security_send), \
- nvme_admin_opcode_name(nvme_admin_security_recv), \
- nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
-
-#define nvme_opcode_name(opcode) { opcode, #opcode }
-#define show_nvm_opcode_name(val) \
- __print_symbolic(val, \
- nvme_opcode_name(nvme_cmd_flush), \
- nvme_opcode_name(nvme_cmd_write), \
- nvme_opcode_name(nvme_cmd_read), \
- nvme_opcode_name(nvme_cmd_write_uncor), \
- nvme_opcode_name(nvme_cmd_compare), \
- nvme_opcode_name(nvme_cmd_write_zeroes), \
- nvme_opcode_name(nvme_cmd_dsm), \
- nvme_opcode_name(nvme_cmd_resv_register), \
- nvme_opcode_name(nvme_cmd_resv_report), \
- nvme_opcode_name(nvme_cmd_resv_acquire), \
- nvme_opcode_name(nvme_cmd_resv_release))
-
-#define show_opcode_name(qid, opcode) \
- (qid ? show_nvm_opcode_name(opcode) : show_admin_opcode_name(opcode))
-
const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
u8 *cdw10);
const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
u8 *cdw10);
+const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p, u8 fctype,
+ u8 *spc);
-#define parse_nvme_cmd(qid, opcode, cdw10) \
- (qid ? \
- nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \
- nvme_trace_parse_admin_cmd(p, opcode, cdw10))
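+/*
+ * Fabrics commands share a single opcode (nvme_fabrics_command, 0x7f) and
+ * are told apart by fctype, so they get their own parser.
+ */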
+#define parse_nvme_cmd(qid, opcode, fctype, cdw10) \
+ ((opcode) == nvme_fabrics_command ? \
+ nvme_trace_parse_fabrics_cmd(p, fctype, cdw10) : \
+ ((qid) ? \
+ nvme_trace_parse_nvm_cmd(p, opcode, cdw10) : \
+ nvme_trace_parse_admin_cmd(p, opcode, cdw10)))
const char *nvme_trace_disk_name(struct trace_seq *p, char *name);
#define __print_disk_name(name) \
@@ -101,6 +53,7 @@
__field(int, qid)
__field(u8, opcode)
__field(u8, flags)
+ __field(u8, fctype)
__field(u16, cid)
__field(u32, nsid)
__field(u64, metadata)
@@ -114,16 +67,19 @@
__entry->cid = cmd->common.command_id;
__entry->nsid = le32_to_cpu(cmd->common.nsid);
__entry->metadata = le64_to_cpu(cmd->common.metadata);
+ __entry->fctype = cmd->fabrics.fctype;
__assign_disk_name(__entry->disk, req->rq_disk);
- memcpy(__entry->cdw10, cmd->common.cdw10,
- sizeof(__entry->cdw10));
+ memcpy(__entry->cdw10, &cmd->common.cdw10,
+ sizeof(__entry->cdw10));
),
TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
__entry->ctrl_id, __print_disk_name(__entry->disk),
__entry->qid, __entry->cid, __entry->nsid,
__entry->flags, __entry->metadata,
- show_opcode_name(__entry->qid, __entry->opcode),
- parse_nvme_cmd(__entry->qid, __entry->opcode, __entry->cdw10))
+ show_opcode_name(__entry->qid, __entry->opcode,
+ __entry->fctype),
+ parse_nvme_cmd(__entry->qid, __entry->opcode,
+ __entry->fctype, __entry->cdw10))
);
TRACE_EVENT(nvme_complete_rq,
@@ -149,13 +105,65 @@
__entry->status = nvme_req(req)->status;
__assign_disk_name(__entry->disk, req->rq_disk);
),
- TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%llu, retries=%u, flags=0x%x, status=%u",
+ TP_printk("nvme%d: %sqid=%d, cmdid=%u, res=%#llx, retries=%u, flags=0x%x, status=%#x",
__entry->ctrl_id, __print_disk_name(__entry->disk),
__entry->qid, __entry->cid, __entry->result,
__entry->retries, __entry->flags, __entry->status)
);
+#define aer_name(aer) { aer, #aer }
+
+TRACE_EVENT(nvme_async_event,
+ TP_PROTO(struct nvme_ctrl *ctrl, u32 result),
+ TP_ARGS(ctrl, result),
+ TP_STRUCT__entry(
+ __field(int, ctrl_id)
+ __field(u32, result)
+ ),
+ TP_fast_assign(
+ __entry->ctrl_id = ctrl->instance;
+ __entry->result = result;
+ ),
+ TP_printk("nvme%d: NVME_AEN=%#08x [%s]",
+ __entry->ctrl_id, __entry->result,
+ __print_symbolic(__entry->result,
+ aer_name(NVME_AER_NOTICE_NS_CHANGED),
+ aer_name(NVME_AER_NOTICE_ANA),
+ aer_name(NVME_AER_NOTICE_FW_ACT_STARTING),
+ aer_name(NVME_AER_NOTICE_DISC_CHANGED),
+ aer_name(NVME_AER_ERROR),
+ aer_name(NVME_AER_SMART),
+ aer_name(NVME_AER_CSS),
+ aer_name(NVME_AER_VS))
+ )
+);
+
+#undef aer_name
+
+TRACE_EVENT(nvme_sq,
+ TP_PROTO(struct request *req, __le16 sq_head, int sq_tail),
+ TP_ARGS(req, sq_head, sq_tail),
+ TP_STRUCT__entry(
+ __field(int, ctrl_id)
+ __array(char, disk, DISK_NAME_LEN)
+ __field(int, qid)
+ __field(u16, sq_head)
+ __field(u16, sq_tail)
+ ),
+ TP_fast_assign(
+ __entry->ctrl_id = nvme_req(req)->ctrl->instance;
+ __assign_disk_name(__entry->disk, req->rq_disk);
+ __entry->qid = nvme_req_qid(req);
+ __entry->sq_head = le16_to_cpu(sq_head);
+ __entry->sq_tail = sq_tail;
+ ),
+ TP_printk("nvme%d: %sqid=%d, head=%u, tail=%u",
+ __entry->ctrl_id, __print_disk_name(__entry->disk),
+ __entry->qid, __entry->sq_head, __entry->sq_tail
+ )
+);
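+
+/*
+ * nvme_sq can be enabled at runtime via tracefs, e.g.
+ * "echo 1 > /sys/kernel/tracing/events/nvme/nvme_sq/enable"
+ * (tracefs may also be mounted under /sys/kernel/debug/tracing).
+ */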
+
#endif /* _TRACE_NVME_H */
#undef TRACE_INCLUDE_PATH