Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index b6a28de..cb4c300 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* NVMe over Fabrics RDMA host code.
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -119,6 +111,7 @@
struct nvme_ctrl ctrl;
bool use_inline_data;
+ u32 io_queues[HCTX_MAX_TYPES];
};
static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
@@ -162,6 +155,13 @@
return queue - queue->ctrl->queues;
}
+static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue)
+{
+ return nvme_rdma_queue_idx(queue) >
+ queue->ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+ queue->ctrl->io_queues[HCTX_TYPE_READ];
+}
+
static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
{
return queue->cmnd_capsule_len - sizeof(struct nvme_command);
@@ -213,6 +213,11 @@
if (!ring)
return NULL;
+ /*
+ * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
+ * lifetime. It's safe, since any chage in the underlying RDMA device
+ * will issue error recovery and queue re-creation.
+ */
for (i = 0; i < ib_queue_size; i++) {
if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
goto out_free_ring;
@@ -234,8 +239,15 @@
static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
{
- wait_for_completion_interruptible_timeout(&queue->cm_done,
+ int ret;
+
+ ret = wait_for_completion_interruptible_timeout(&queue->cm_done,
msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ return -ETIMEDOUT;
+ WARN_ON_ONCE(queue->cm_error > 0);
return queue->cm_error;
}
@@ -267,14 +279,9 @@
static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
struct request *rq, unsigned int hctx_idx)
{
- struct nvme_rdma_ctrl *ctrl = set->driver_data;
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
- int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
- struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
- struct nvme_rdma_device *dev = queue->device;
- nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
- DMA_TO_DEVICE);
+ kfree(req->sqe.data);
}
static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
@@ -285,15 +292,11 @@
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
- struct nvme_rdma_device *dev = queue->device;
- struct ib_device *ibdev = dev->dev;
- int ret;
nvme_req(rq)->ctrl = &ctrl->ctrl;
- ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
- DMA_TO_DEVICE);
- if (ret)
- return ret;
+ req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL);
+ if (!req->sqe.data)
+ return -ENOMEM;
req->queue = queue;
@@ -424,7 +427,7 @@
static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
{
return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
- ibdev->attrs.max_fast_reg_page_list_len);
+ ibdev->attrs.max_fast_reg_page_list_len - 1);
}
static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
@@ -433,7 +436,8 @@
const int send_wr_factor = 3; /* MR, SEND, INV */
const int cq_factor = send_wr_factor + 1; /* + RECV */
int comp_vector, idx = nvme_rdma_queue_idx(queue);
- int ret;
+ enum ib_poll_context poll_ctx;
+ int ret, pages_per_mr;
queue->device = nvme_rdma_find_get_device(queue->cm_id);
if (!queue->device) {
@@ -449,10 +453,16 @@
*/
comp_vector = idx == 0 ? idx : idx - 1;
+ /* Polling queues need direct cq polling context */
+ if (nvme_rdma_poll_queue(queue))
+ poll_ctx = IB_POLL_DIRECT;
+ else
+ poll_ctx = IB_POLL_SOFTIRQ;
+
/* +1 for ib_stop_cq */
queue->ib_cq = ib_alloc_cq(ibdev, queue,
cq_factor * queue->queue_size + 1,
- comp_vector, IB_POLL_SOFTIRQ);
+ comp_vector, poll_ctx);
if (IS_ERR(queue->ib_cq)) {
ret = PTR_ERR(queue->ib_cq);
goto out_put_dev;
@@ -469,10 +479,16 @@
goto out_destroy_qp;
}
+ /*
+ * Currently we don't use SG_GAPS MR's so if the first entry is
+ * misaligned we'll end up using two entries for a single data page,
+ * so one additional entry is required.
+ */
+ pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev) + 1;
ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
queue->queue_size,
IB_MR_TYPE_MEM_REG,
- nvme_rdma_get_max_fr_pages(ibdev));
+ pages_per_mr, 0);
if (ret) {
dev_err(queue->ctrl->ctrl.device,
"failed to initialize MR pool sized %d for QID %d\n",
@@ -552,13 +568,17 @@
return ret;
}
+static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
+{
+ rdma_disconnect(queue->cm_id);
+ ib_drain_qp(queue->qp);
+}
+
static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
{
if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
return;
-
- rdma_disconnect(queue->cm_id);
- ib_drain_qp(queue->qp);
+ __nvme_rdma_stop_queue(queue);
}
static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
@@ -588,18 +608,23 @@
static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
{
+ struct nvme_rdma_queue *queue = &ctrl->queues[idx];
+ bool poll = nvme_rdma_poll_queue(queue);
int ret;
if (idx)
- ret = nvmf_connect_io_queue(&ctrl->ctrl, idx);
+ ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, poll);
else
ret = nvmf_connect_admin_queue(&ctrl->ctrl);
- if (!ret)
- set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[idx].flags);
- else
+ if (!ret) {
+ set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
+ } else {
+ if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
+ __nvme_rdma_stop_queue(queue);
dev_info(ctrl->ctrl.device,
"failed to connect queue: %d ret=%d\n", idx, ret);
+ }
return ret;
}
@@ -625,18 +650,16 @@
{
struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
struct ib_device *ibdev = ctrl->device->dev;
- unsigned int nr_io_queues;
+ unsigned int nr_io_queues, nr_default_queues;
+ unsigned int nr_read_queues, nr_poll_queues;
int i, ret;
- nr_io_queues = min(opts->nr_io_queues, num_online_cpus());
-
- /*
- * we map queues according to the device irq vectors for
- * optimal locality so we don't need more queues than
- * completion vectors.
- */
- nr_io_queues = min_t(unsigned int, nr_io_queues,
- ibdev->num_comp_vectors);
+ nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
+ min(opts->nr_io_queues, num_online_cpus()));
+ nr_default_queues = min_t(unsigned int, ibdev->num_comp_vectors,
+ min(opts->nr_write_queues, num_online_cpus()));
+ nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
+ nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;
ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
if (ret)
@@ -649,6 +672,34 @@
dev_info(ctrl->ctrl.device,
"creating %d I/O queues.\n", nr_io_queues);
+ if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
+ /*
+ * separate read/write queues
+ * hand out dedicated default queues only after we have
+ * sufficient read queues.
+ */
+ ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues;
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+ min(nr_default_queues, nr_io_queues);
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ } else {
+ /*
+ * shared read/write queues
+ * either no write queues were requested, or we don't have
+ * sufficient queue count to have dedicated default queues.
+ */
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] =
+ min(nr_read_queues, nr_io_queues);
+ nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ }
+
+ if (opts->nr_poll_queues && nr_io_queues) {
+ /* map dedicated poll queues only if we have queues left */
+ ctrl->io_queues[HCTX_TYPE_POLL] =
+ min(nr_poll_queues, nr_io_queues);
+ }
+
for (i = 1; i < ctrl->ctrl.queue_count; i++) {
ret = nvme_rdma_alloc_queue(ctrl, i,
ctrl->ctrl.sqsize + 1);
@@ -665,15 +716,6 @@
return ret;
}
-static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl,
- struct blk_mq_tag_set *set)
-{
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-
- blk_mq_free_tag_set(set);
- nvme_rdma_dev_put(ctrl->device);
-}
-
static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
bool admin)
{
@@ -687,7 +729,7 @@
set->ops = &nvme_rdma_admin_mq_ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = 2; /* connect + keep-alive */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->numa_node;
set->cmd_size = sizeof(struct nvme_rdma_request) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
set->driver_data = ctrl;
@@ -700,35 +742,21 @@
set->ops = &nvme_rdma_mq_ops;
set->queue_depth = nctrl->sqsize + 1;
set->reserved_tags = 1; /* fabric connect */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_SHOULD_MERGE;
set->cmd_size = sizeof(struct nvme_rdma_request) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
set->driver_data = ctrl;
set->nr_hw_queues = nctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
+ set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
}
ret = blk_mq_alloc_tag_set(set);
if (ret)
- goto out;
-
- /*
- * We need a reference on the device as long as the tag_set is alive,
- * as the MRs in the request structures need a valid ib_device.
- */
- ret = nvme_rdma_dev_get(ctrl->device);
- if (!ret) {
- ret = -EINVAL;
- goto out_free_tagset;
- }
+ return ERR_PTR(ret);
return set;
-
-out_free_tagset:
- blk_mq_free_tag_set(set);
-out:
- return ERR_PTR(ret);
}
static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
@@ -736,7 +764,8 @@
{
if (remove) {
blk_cleanup_queue(ctrl->ctrl.admin_q);
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
+ blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
}
if (ctrl->async_event_sqe.data) {
nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
@@ -756,9 +785,15 @@
return error;
ctrl->device = ctrl->queues[0].device;
+ ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device);
ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
+ /*
+ * Bind the async event SQE DMA mapping to the admin queue lifetime.
+ * It's safe, since any chage in the underlying RDMA device will issue
+ * error recovery and queue re-creation.
+ */
error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
sizeof(struct nvme_command), DMA_TO_DEVICE);
if (error)
@@ -771,10 +806,16 @@
goto out_free_async_qe;
}
+ ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
+ if (IS_ERR(ctrl->ctrl.fabrics_q)) {
+ error = PTR_ERR(ctrl->ctrl.fabrics_q);
+ goto out_free_tagset;
+ }
+
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.admin_q)) {
error = PTR_ERR(ctrl->ctrl.admin_q);
- goto out_free_tagset;
+ goto out_cleanup_fabrics_q;
}
}
@@ -782,23 +823,14 @@
if (error)
goto out_cleanup_queue;
- error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP,
- &ctrl->ctrl.cap);
- if (error) {
- dev_err(ctrl->ctrl.device,
- "prop_get NVME_REG_CAP failed\n");
- goto out_stop_queue;
- }
-
- ctrl->ctrl.sqsize =
- min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize);
-
- error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+ error = nvme_enable_ctrl(&ctrl->ctrl);
if (error)
goto out_stop_queue;
- ctrl->ctrl.max_hw_sectors =
- (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9);
+ ctrl->ctrl.max_segments = ctrl->max_fr_pages;
+ ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);
+
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
error = nvme_init_identify(&ctrl->ctrl);
if (error)
@@ -811,9 +843,12 @@
out_cleanup_queue:
if (new)
blk_cleanup_queue(ctrl->ctrl.admin_q);
+out_cleanup_fabrics_q:
+ if (new)
+ blk_cleanup_queue(ctrl->ctrl.fabrics_q);
out_free_tagset:
if (new)
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
out_free_async_qe:
nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
sizeof(struct nvme_command), DMA_TO_DEVICE);
@@ -828,7 +863,7 @@
{
if (remove) {
blk_cleanup_queue(ctrl->ctrl.connect_q);
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.tagset);
}
nvme_rdma_free_io_queues(ctrl);
}
@@ -869,7 +904,7 @@
blk_cleanup_queue(ctrl->ctrl.connect_q);
out_free_tag_set:
if (new)
- nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset);
+ blk_mq_free_tag_set(ctrl->ctrl.tagset);
out_free_io_queues:
nvme_rdma_free_io_queues(ctrl);
return ret;
@@ -880,9 +915,13 @@
{
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
- blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request,
- &ctrl->ctrl);
- blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+ if (ctrl->ctrl.admin_tagset) {
+ blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
+ nvme_cancel_request, &ctrl->ctrl);
+ blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset);
+ }
+ if (remove)
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
nvme_rdma_destroy_admin_queue(ctrl, remove);
}
@@ -892,22 +931,17 @@
if (ctrl->ctrl.queue_count > 1) {
nvme_stop_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
- blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request,
- &ctrl->ctrl);
+ if (ctrl->ctrl.tagset) {
+ blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
+ nvme_cancel_request, &ctrl->ctrl);
+ blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset);
+ }
if (remove)
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_destroy_io_queues(ctrl, remove);
}
}
-static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
-{
- struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-
- cancel_work_sync(&ctrl->err_work);
- cancel_delayed_work_sync(&ctrl->reconnect_work);
-}
-
static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -1038,6 +1072,7 @@
nvme_rdma_teardown_io_queues(ctrl, false);
nvme_start_queues(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, false);
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
@@ -1116,7 +1151,7 @@
struct nvme_rdma_device *dev = queue->device;
struct ib_device *ibdev = dev->dev;
- if (!blk_rq_payload_bytes(rq))
+ if (!blk_rq_nr_phys_segments(rq))
return;
if (req->mr) {
@@ -1124,12 +1159,10 @@
req->mr = NULL;
}
- ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
- req->nents, rq_data_dir(rq) ==
- WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
nvme_cleanup_cmd(rq);
- sg_free_table_chained(&req->sg_table, true);
+ sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
}
static int nvme_rdma_set_sg_null(struct nvme_command *c)
@@ -1239,19 +1272,20 @@
c->common.flags |= NVME_CMD_SGL_METABUF;
- if (!blk_rq_payload_bytes(rq))
+ if (!blk_rq_nr_phys_segments(rq))
return nvme_rdma_set_sg_null(c);
req->sg_table.sgl = req->first_sgl;
ret = sg_alloc_table_chained(&req->sg_table,
- blk_rq_nr_phys_segments(rq), req->sg_table.sgl);
+ blk_rq_nr_phys_segments(rq), req->sg_table.sgl,
+ SG_CHUNK_SIZE);
if (ret)
return -ENOMEM;
req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
- rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ rq_dma_dir(rq));
if (unlikely(count <= 0)) {
ret = -EIO;
goto out_free_table;
@@ -1280,11 +1314,9 @@
return 0;
out_unmap_sg:
- ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
- req->nents, rq_data_dir(rq) ==
- WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
out_free_table:
- sg_free_table_chained(&req->sg_table, true);
+ sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
return ret;
}
@@ -1404,12 +1436,11 @@
WARN_ON_ONCE(ret);
}
-static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
- struct nvme_completion *cqe, struct ib_wc *wc, int tag)
+static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
+ struct nvme_completion *cqe, struct ib_wc *wc)
{
struct request *rq;
struct nvme_rdma_request *req;
- int ret = 0;
rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
if (!rq) {
@@ -1417,7 +1448,7 @@
"tag 0x%x on QP %#x not found\n",
cqe->command_id, queue->qp->qp_num);
nvme_rdma_error_recovery(queue->ctrl);
- return ret;
+ return;
}
req = blk_mq_rq_to_pdu(rq);
@@ -1432,6 +1463,8 @@
nvme_rdma_error_recovery(queue->ctrl);
}
} else if (req->mr) {
+ int ret;
+
ret = nvme_rdma_inv_rkey(queue, req);
if (unlikely(ret < 0)) {
dev_err(queue->ctrl->ctrl.device,
@@ -1440,19 +1473,14 @@
nvme_rdma_error_recovery(queue->ctrl);
}
/* the local invalidation completion will end the request */
- return 0;
+ return;
}
- if (refcount_dec_and_test(&req->ref)) {
- if (rq->tag == tag)
- ret = 1;
+ if (refcount_dec_and_test(&req->ref))
nvme_end_request(rq, req->status, req->result);
- }
-
- return ret;
}
-static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
+static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvme_rdma_qe *qe =
container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
@@ -1460,11 +1488,10 @@
struct ib_device *ibdev = queue->device->dev;
struct nvme_completion *cqe = qe->data;
const size_t len = sizeof(struct nvme_completion);
- int ret = 0;
if (unlikely(wc->status != IB_WC_SUCCESS)) {
nvme_rdma_wr_error(cq, wc, "RECV");
- return 0;
+ return;
}
ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
@@ -1479,16 +1506,10 @@
nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
&cqe->result);
else
- ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag);
+ nvme_rdma_process_nvme_rsp(queue, cqe, wc);
ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
nvme_rdma_post_recv(queue, qe);
- return ret;
-}
-
-static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
-{
- __nvme_rdma_recv_done(cq, wc, -1);
}
static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
@@ -1536,16 +1557,18 @@
static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
{
+ struct nvme_ctrl *ctrl = &queue->ctrl->ctrl;
int ret;
ret = nvme_rdma_create_queue_ib(queue);
if (ret)
return ret;
+ if (ctrl->opts->tos >= 0)
+ rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
if (ret) {
- dev_err(queue->ctrl->ctrl.device,
- "rdma_resolve_route failed (%d).\n",
+ dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
queue->cm_error);
goto out_destroy_queue;
}
@@ -1672,18 +1695,36 @@
nvme_rdma_timeout(struct request *rq, bool reserved)
{
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_rdma_queue *queue = req->queue;
+ struct nvme_rdma_ctrl *ctrl = queue->ctrl;
- dev_warn(req->queue->ctrl->ctrl.device,
- "I/O %d QID %d timeout, reset controller\n",
- rq->tag, nvme_rdma_queue_idx(req->queue));
+ dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
+ rq->tag, nvme_rdma_queue_idx(queue));
- /* queue error recovery */
- nvme_rdma_error_recovery(req->queue->ctrl);
+ /*
+ * Restart the timer if a controller reset is already scheduled. Any
+ * timed out commands would be handled before entering the connecting
+ * state.
+ */
+ if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
+ return BLK_EH_RESET_TIMER;
- /* fail with DNR on cmd timeout */
- nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+ if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
+ /*
+ * Teardown immediately if controller times out while starting
+ * or we are already started error recovery. all outstanding
+ * requests are completed on shutdown, so we return BLK_EH_DONE.
+ */
+ flush_work(&ctrl->err_work);
+ nvme_rdma_teardown_io_queues(ctrl, false);
+ nvme_rdma_teardown_admin_queue(ctrl, false);
+ return BLK_EH_DONE;
+ }
- return BLK_EH_DONE;
+ dev_warn(ctrl->ctrl.device, "starting error recovery\n");
+ nvme_rdma_error_recovery(ctrl);
+
+ return BLK_EH_RESET_TIMER;
}
static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -1706,12 +1747,20 @@
return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
dev = queue->device->dev;
+
+ req->sqe.dma = ib_dma_map_single(dev, req->sqe.data,
+ sizeof(struct nvme_command),
+ DMA_TO_DEVICE);
+ err = ib_dma_mapping_error(dev, req->sqe.dma);
+ if (unlikely(err))
+ return BLK_STS_RESOURCE;
+
ib_dma_sync_single_for_cpu(dev, sqe->dma,
sizeof(struct nvme_command), DMA_TO_DEVICE);
ret = nvme_setup_cmd(ns, rq, c);
if (ret)
- return ret;
+ goto unmap_qe;
blk_mq_start_request(rq);
@@ -1736,46 +1785,82 @@
}
return BLK_STS_OK;
+
err:
if (err == -ENOMEM || err == -EAGAIN)
- return BLK_STS_RESOURCE;
- return BLK_STS_IOERR;
+ ret = BLK_STS_RESOURCE;
+ else
+ ret = BLK_STS_IOERR;
+unmap_qe:
+ ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command),
+ DMA_TO_DEVICE);
+ return ret;
}
-static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
{
struct nvme_rdma_queue *queue = hctx->driver_data;
- struct ib_cq *cq = queue->ib_cq;
- struct ib_wc wc;
- int found = 0;
- while (ib_poll_cq(cq, 1, &wc) > 0) {
- struct ib_cqe *cqe = wc.wr_cqe;
-
- if (cqe) {
- if (cqe->done == nvme_rdma_recv_done)
- found |= __nvme_rdma_recv_done(cq, &wc, tag);
- else
- cqe->done(cq, &wc);
- }
- }
-
- return found;
+ return ib_process_cq_direct(queue->ib_cq, -1);
}
static void nvme_rdma_complete_rq(struct request *rq)
{
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_rdma_queue *queue = req->queue;
+ struct ib_device *ibdev = queue->device->dev;
- nvme_rdma_unmap_data(req->queue, rq);
+ nvme_rdma_unmap_data(queue, rq);
+ ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command),
+ DMA_TO_DEVICE);
nvme_complete_rq(rq);
}
static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
{
struct nvme_rdma_ctrl *ctrl = set->driver_data;
+ struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
- return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0);
+ if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
+ /* separate read/write queues */
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+ set->map[HCTX_TYPE_READ].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_READ];
+ set->map[HCTX_TYPE_READ].queue_offset =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ } else {
+ /* shared read/write queues */
+ set->map[HCTX_TYPE_DEFAULT].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+ set->map[HCTX_TYPE_READ].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT];
+ set->map[HCTX_TYPE_READ].queue_offset = 0;
+ }
+ blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
+ ctrl->device->dev, 0);
+ blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
+ ctrl->device->dev, 0);
+
+ if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
+ /* map dedicated poll queues only if we have queues left */
+ set->map[HCTX_TYPE_POLL].nr_queues =
+ ctrl->io_queues[HCTX_TYPE_POLL];
+ set->map[HCTX_TYPE_POLL].queue_offset =
+ ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+ ctrl->io_queues[HCTX_TYPE_READ];
+ blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
+ }
+
+ dev_info(ctrl->ctrl.device,
+ "mapped %d/%d/%d default/read/poll queues.\n",
+ ctrl->io_queues[HCTX_TYPE_DEFAULT],
+ ctrl->io_queues[HCTX_TYPE_READ],
+ ctrl->io_queues[HCTX_TYPE_POLL]);
+
+ return 0;
}
static const struct blk_mq_ops nvme_rdma_mq_ops = {
@@ -1784,9 +1869,9 @@
.init_request = nvme_rdma_init_request,
.exit_request = nvme_rdma_exit_request,
.init_hctx = nvme_rdma_init_hctx,
- .poll = nvme_rdma_poll,
.timeout = nvme_rdma_timeout,
.map_queues = nvme_rdma_map_queues,
+ .poll = nvme_rdma_poll,
};
static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
@@ -1800,11 +1885,15 @@
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{
+ cancel_work_sync(&ctrl->err_work);
+ cancel_delayed_work_sync(&ctrl->reconnect_work);
+
nvme_rdma_teardown_io_queues(ctrl, shutdown);
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
if (shutdown)
nvme_shutdown_ctrl(&ctrl->ctrl);
else
- nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
+ nvme_disable_ctrl(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, shutdown);
}
@@ -1848,57 +1937,8 @@
.submit_async_event = nvme_rdma_submit_async_event,
.delete_ctrl = nvme_rdma_delete_ctrl,
.get_address = nvmf_get_address,
- .stop_ctrl = nvme_rdma_stop_ctrl,
};
-static inline bool
-__nvme_rdma_options_match(struct nvme_rdma_ctrl *ctrl,
- struct nvmf_ctrl_options *opts)
-{
- char *stdport = __stringify(NVME_RDMA_IP_PORT);
-
-
- if (!nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts) ||
- strcmp(opts->traddr, ctrl->ctrl.opts->traddr))
- return false;
-
- if (opts->mask & NVMF_OPT_TRSVCID &&
- ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) {
- if (strcmp(opts->trsvcid, ctrl->ctrl.opts->trsvcid))
- return false;
- } else if (opts->mask & NVMF_OPT_TRSVCID) {
- if (strcmp(opts->trsvcid, stdport))
- return false;
- } else if (ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) {
- if (strcmp(stdport, ctrl->ctrl.opts->trsvcid))
- return false;
- }
- /* else, it's a match as both have stdport. Fall to next checks */
-
- /*
- * checking the local address is rough. In most cases, one
- * is not specified and the host port is selected by the stack.
- *
- * Assume no match if:
- * local address is specified and address is not the same
- * local address is not specified but remote is, or vice versa
- * (admin using specific host_traddr when it matters).
- */
- if (opts->mask & NVMF_OPT_HOST_TRADDR &&
- ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
- if (strcmp(opts->host_traddr, ctrl->ctrl.opts->host_traddr))
- return false;
- } else if (opts->mask & NVMF_OPT_HOST_TRADDR ||
- ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
- return false;
- /*
- * if neither controller had an host port specified, assume it's
- * a match as everything else matched.
- */
-
- return true;
-}
-
/*
* Fails a connection request if it matches an existing controller
* (association) with the same tuple:
@@ -1919,7 +1959,7 @@
mutex_lock(&nvme_rdma_ctrl_mutex);
list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
- found = __nvme_rdma_options_match(ctrl, opts);
+ found = nvmf_ip_options_match(&ctrl->ctrl, opts);
if (found)
break;
}
@@ -1934,7 +1974,6 @@
struct nvme_rdma_ctrl *ctrl;
int ret;
bool changed;
- char *port;
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl)
@@ -1942,15 +1981,21 @@
ctrl->ctrl.opts = opts;
INIT_LIST_HEAD(&ctrl->list);
- if (opts->mask & NVMF_OPT_TRSVCID)
- port = opts->trsvcid;
- else
- port = __stringify(NVME_RDMA_IP_PORT);
+ if (!(opts->mask & NVMF_OPT_TRSVCID)) {
+ opts->trsvcid =
+ kstrdup(__stringify(NVME_RDMA_IP_PORT), GFP_KERNEL);
+ if (!opts->trsvcid) {
+ ret = -ENOMEM;
+ goto out_free_ctrl;
+ }
+ opts->mask |= NVMF_OPT_TRSVCID;
+ }
ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
- opts->traddr, port, &ctrl->addr);
+ opts->traddr, opts->trsvcid, &ctrl->addr);
if (ret) {
- pr_err("malformed address passed: %s:%s\n", opts->traddr, port);
+ pr_err("malformed address passed: %s:%s\n",
+ opts->traddr, opts->trsvcid);
goto out_free_ctrl;
}
@@ -1974,7 +2019,8 @@
INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
- ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
+ ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
+ opts->nr_poll_queues + 1;
ctrl->ctrl.sqsize = opts->queue_size - 1;
ctrl->ctrl.kato = opts->kato;
@@ -2025,7 +2071,9 @@
.module = THIS_MODULE,
.required_opts = NVMF_OPT_TRADDR,
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
- NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
+ NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
+ NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
+ NVMF_OPT_TOS,
.create_ctrl = nvme_rdma_create_ctrl,
};
@@ -2085,8 +2133,16 @@
static void __exit nvme_rdma_cleanup_module(void)
{
+ struct nvme_rdma_ctrl *ctrl;
+
nvmf_unregister_transport(&nvme_rdma_transport);
ib_unregister_client(&nvme_rdma_ib_client);
+
+ mutex_lock(&nvme_rdma_ctrl_mutex);
+ list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list)
+ nvme_delete_ctrl(&ctrl->ctrl);
+ mutex_unlock(&nvme_rdma_ctrl_mutex);
+ flush_workqueue(nvme_delete_wq);
}
module_init(nvme_rdma_init_module);