Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 38bbbbb..6105894 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -20,6 +20,16 @@
struct nvme_tcp_queue;
+/* Define the socket priority to use for connections were it is desirable
+ * that the NIC consider performing optimized packet processing or filtering.
+ * A non-zero value being sufficient to indicate general consideration of any
+ * possible optimization. Making it a module param allows for alternative
+ * values that may be unique for some NIC implementations.
+ */
+static int so_priority;
+module_param(so_priority, int, 0644);
+MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
+
enum nvme_tcp_send_state {
NVME_TCP_SEND_CMD_PDU = 0,
NVME_TCP_SEND_H2C_PDU,
@@ -36,6 +46,7 @@
u32 pdu_sent;
u16 ttag;
struct list_head entry;
+ struct llist_node lentry;
__le32 ddgst;
struct bio *curr_bio;
@@ -50,6 +61,7 @@
enum nvme_tcp_queue_flags {
NVME_TCP_Q_ALLOCATED = 0,
NVME_TCP_Q_LIVE = 1,
+ NVME_TCP_Q_POLLING = 2,
};
enum nvme_tcp_recv_state {
@@ -64,8 +76,11 @@
struct work_struct io_work;
int io_cpu;
- spinlock_t lock;
+ struct mutex queue_lock;
+ struct mutex send_mutex;
+ struct llist_head req_list;
struct list_head send_list;
+ bool more_requests;
/* recv state */
void *pdu;
@@ -119,8 +134,9 @@
static LIST_HEAD(nvme_tcp_ctrl_list);
static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
static struct workqueue_struct *nvme_tcp_wq;
-static struct blk_mq_ops nvme_tcp_mq_ops;
-static struct blk_mq_ops nvme_tcp_admin_mq_ops;
+static const struct blk_mq_ops nvme_tcp_mq_ops;
+static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
+static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
{
@@ -247,15 +263,57 @@
}
}
-static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
+static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
+{
+ int ret;
+
+ /* drain the send queue as much as we can... */
+ do {
+ ret = nvme_tcp_try_send(queue);
+ } while (ret > 0);
+}
+
+static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
+{
+ return !list_empty(&queue->send_list) ||
+ !llist_empty(&queue->req_list) || queue->more_requests;
+}
+
+static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
+ bool sync, bool last)
{
struct nvme_tcp_queue *queue = req->queue;
+ bool empty;
- spin_lock(&queue->lock);
- list_add_tail(&req->entry, &queue->send_list);
- spin_unlock(&queue->lock);
+ empty = llist_add(&req->lentry, &queue->req_list) &&
+ list_empty(&queue->send_list) && !queue->request;
- queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+ /*
+ * if we're the first on the send_list and we can try to send
+ * directly, otherwise queue io_work. Also, only do that if we
+ * are on the same cpu, so we don't introduce contention.
+ */
+ if (queue->io_cpu == raw_smp_processor_id() &&
+ sync && empty && mutex_trylock(&queue->send_mutex)) {
+ queue->more_requests = !last;
+ nvme_tcp_send_all(queue);
+ queue->more_requests = false;
+ mutex_unlock(&queue->send_mutex);
+ }
+
+ if (last && nvme_tcp_queue_more(queue))
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+}
+
+static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_request *req;
+ struct llist_node *node;
+
+ for (node = llist_del_all(&queue->req_list); node; node = node->next) {
+ req = llist_entry(node, struct nvme_tcp_request, lentry);
+ list_add(&req->entry, &queue->send_list);
+ }
}
static inline struct nvme_tcp_request *
@@ -263,13 +321,17 @@
{
struct nvme_tcp_request *req;
- spin_lock(&queue->lock);
req = list_first_entry_or_null(&queue->send_list,
struct nvme_tcp_request, entry);
- if (req)
- list_del(&req->entry);
- spin_unlock(&queue->lock);
+ if (!req) {
+ nvme_tcp_process_req_list(queue);
+ req = list_first_entry_or_null(&queue->send_list,
+ struct nvme_tcp_request, entry);
+ if (unlikely(!req))
+ return NULL;
+ }
+ list_del(&req->entry);
return req;
}
@@ -429,16 +491,17 @@
{
struct request *rq;
- rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
+ rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
- "queue %d tag 0x%x not found\n",
- nvme_tcp_queue_id(queue), cqe->command_id);
+ "got bad cqe.command_id %#x on queue %d\n",
+ cqe->command_id, nvme_tcp_queue_id(queue));
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
return -EINVAL;
}
- nvme_end_request(rq, cqe->status, cqe->result);
+ if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
+ nvme_complete_rq(rq);
queue->nr_cqe++;
return 0;
@@ -449,11 +512,11 @@
{
struct request *rq;
- rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+ rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
- "queue %d tag %#x not found\n",
- nvme_tcp_queue_id(queue), pdu->command_id);
+ "got bad c2hdata.command_id %#x on queue %d\n",
+ pdu->command_id, nvme_tcp_queue_id(queue));
return -ENOENT;
}
@@ -490,8 +553,8 @@
* aborts. We don't even bother to allocate a struct request
* for them but rather special case them here.
*/
- if (unlikely(nvme_tcp_queue_id(queue) == 0 &&
- cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
+ if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
+ cqe->command_id)))
nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
&cqe->result);
else
@@ -547,8 +610,8 @@
data->hdr.plen =
cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
data->ttag = pdu->ttag;
- data->command_id = rq->tag;
- data->data_offset = cpu_to_le32(req->data_sent);
+ data->command_id = nvme_cid(rq);
+ data->data_offset = pdu->r2t_offset;
data->data_length = cpu_to_le32(req->pdu_len);
return 0;
}
@@ -560,11 +623,11 @@
struct request *rq;
int ret;
- rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+ rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
if (!rq) {
dev_err(queue->ctrl->ctrl.device,
- "queue %d tag %#x not found\n",
- nvme_tcp_queue_id(queue), pdu->command_id);
+ "got bad r2t.command_id %#x on queue %d\n",
+ pdu->command_id, nvme_tcp_queue_id(queue));
return -ENOENT;
}
req = blk_mq_rq_to_pdu(rq);
@@ -576,7 +639,7 @@
req->state = NVME_TCP_SEND_H2C_PDU;
req->offset = 0;
- nvme_tcp_queue_request(req);
+ nvme_tcp_queue_request(req, false, true);
return 0;
}
@@ -635,7 +698,8 @@
{
union nvme_result res = {};
- nvme_end_request(rq, cpu_to_le16(status << 1), res);
+ if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
+ nvme_complete_rq(rq);
}
static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
@@ -643,7 +707,7 @@
{
struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
struct request *rq =
- blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+ nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
while (true) {
@@ -736,8 +800,8 @@
}
if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
- struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
- pdu->command_id);
+ struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
+ pdu->command_id);
nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
queue->nr_cqe++;
@@ -786,7 +850,8 @@
read_lock_bh(&sk->sk_callback_lock);
queue = sk->sk_user_data;
- if (likely(queue && queue->rd_enabled))
+ if (likely(queue && queue->rd_enabled) &&
+ !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
read_unlock_bh(&sk->sk_callback_lock);
}
@@ -819,7 +884,6 @@
case TCP_LAST_ACK:
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
- /* fallthrough */
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
break;
default:
@@ -840,24 +904,34 @@
static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
{
- nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
+ if (nvme_tcp_async_req(req)) {
+ union nvme_result res = {};
+
+ nvme_complete_async_event(&req->queue->ctrl->ctrl,
+ cpu_to_le16(NVME_SC_HOST_PATH_ERROR), &res);
+ } else {
+ nvme_tcp_end_request(blk_mq_rq_from_pdu(req),
+ NVME_SC_HOST_PATH_ERROR);
+ }
}
static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
+ int req_data_len = req->data_len;
while (true) {
struct page *page = nvme_tcp_req_cur_page(req);
size_t offset = nvme_tcp_req_cur_offset(req);
size_t len = nvme_tcp_req_cur_length(req);
bool last = nvme_tcp_pdu_last_send(req, len);
+ int req_data_sent = req->data_sent;
int ret, flags = MSG_DONTWAIT;
- if (last && !queue->data_digest)
+ if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
flags |= MSG_EOR;
else
- flags |= MSG_MORE;
+ flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
if (sendpage_ok(page)) {
ret = kernel_sendpage(queue->sock, page, offset, len,
@@ -869,12 +943,19 @@
if (ret <= 0)
return ret;
- nvme_tcp_advance_req(req, ret);
if (queue->data_digest)
nvme_tcp_ddgst_update(queue->snd_hash, page,
offset, ret);
- /* fully successful last write*/
+ /*
+ * update the request iterator except for the last payload send
+ * in the request where we don't want to modify it as we may
+ * compete with the RX path completing the request.
+ */
+ if (req_data_sent + ret < req_data_len)
+ nvme_tcp_advance_req(req, ret);
+
+ /* fully successful last send in current PDU */
if (last && ret == len) {
if (queue->data_digest) {
nvme_tcp_ddgst_final(queue->snd_hash,
@@ -895,11 +976,16 @@
struct nvme_tcp_queue *queue = req->queue;
struct nvme_tcp_cmd_pdu *pdu = req->pdu;
bool inline_data = nvme_tcp_has_inline_data(req);
- int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
u8 hdgst = nvme_tcp_hdgst_len(queue);
int len = sizeof(*pdu) + hdgst - req->offset;
+ int flags = MSG_DONTWAIT;
int ret;
+ if (inline_data || nvme_tcp_queue_more(queue))
+ flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ else
+ flags |= MSG_EOR;
+
if (queue->hdr_digest && !req->offset)
nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
@@ -938,7 +1024,7 @@
ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
offset_in_page(pdu) + req->offset, len,
- MSG_DONTWAIT | MSG_MORE);
+ MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
if (unlikely(ret <= 0))
return ret;
@@ -959,18 +1045,24 @@
static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
{
struct nvme_tcp_queue *queue = req->queue;
+ size_t offset = req->offset;
int ret;
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
struct kvec iov = {
- .iov_base = &req->ddgst + req->offset,
+ .iov_base = (u8 *)&req->ddgst + req->offset,
.iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
};
+ if (nvme_tcp_queue_more(queue))
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags |= MSG_EOR;
+
ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
if (unlikely(ret <= 0))
return ret;
- if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
+ if (offset + ret == NVME_TCP_DIGEST_LENGTH) {
nvme_tcp_done_send_req(queue);
return 1;
}
@@ -1014,8 +1106,15 @@
if (req->state == NVME_TCP_SEND_DDGST)
ret = nvme_tcp_try_send_ddgst(req);
done:
- if (ret == -EAGAIN)
+ if (ret == -EAGAIN) {
ret = 0;
+ } else if (ret < 0) {
+ dev_err(queue->ctrl->ctrl.device,
+ "failed to send request %d\n", ret);
+ if (ret != -EPIPE && ret != -ECONNRESET)
+ nvme_tcp_fail_request(queue->request);
+ nvme_tcp_done_send_req(queue);
+ }
return ret;
}
@@ -1045,26 +1144,20 @@
bool pending = false;
int result;
- result = nvme_tcp_try_send(queue);
- if (result > 0) {
- pending = true;
- } else if (unlikely(result < 0)) {
- dev_err(queue->ctrl->ctrl.device,
- "failed to send request %d\n", result);
-
- /*
- * Fail the request unless peer closed the connection,
- * in which case error recovery flow will complete all.
- */
- if ((result != -EPIPE) && (result != -ECONNRESET))
- nvme_tcp_fail_request(queue->request);
- nvme_tcp_done_send_req(queue);
- return;
+ if (mutex_trylock(&queue->send_mutex)) {
+ result = nvme_tcp_try_send(queue);
+ mutex_unlock(&queue->send_mutex);
+ if (result > 0)
+ pending = true;
+ else if (unlikely(result < 0))
+ break;
}
result = nvme_tcp_try_recv(queue);
if (result > 0)
pending = true;
+ else if (unlikely(result < 0))
+ return;
if (!pending)
return;
@@ -1145,6 +1238,7 @@
sock_release(queue->sock);
kfree(queue->pdu);
+ mutex_destroy(&queue->queue_lock);
}
static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
@@ -1245,17 +1339,72 @@
return ret;
}
+static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
+{
+ return nvme_tcp_queue_id(queue) == 0;
+}
+
+static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_ctrl *ctrl = queue->ctrl;
+ int qid = nvme_tcp_queue_id(queue);
+
+ return !nvme_tcp_admin_queue(queue) &&
+ qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
+}
+
+static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_ctrl *ctrl = queue->ctrl;
+ int qid = nvme_tcp_queue_id(queue);
+
+ return !nvme_tcp_admin_queue(queue) &&
+ !nvme_tcp_default_queue(queue) &&
+ qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+ ctrl->io_queues[HCTX_TYPE_READ];
+}
+
+static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_ctrl *ctrl = queue->ctrl;
+ int qid = nvme_tcp_queue_id(queue);
+
+ return !nvme_tcp_admin_queue(queue) &&
+ !nvme_tcp_default_queue(queue) &&
+ !nvme_tcp_read_queue(queue) &&
+ qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
+ ctrl->io_queues[HCTX_TYPE_READ] +
+ ctrl->io_queues[HCTX_TYPE_POLL];
+}
+
+static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
+{
+ struct nvme_tcp_ctrl *ctrl = queue->ctrl;
+ int qid = nvme_tcp_queue_id(queue);
+ int n = 0;
+
+ if (nvme_tcp_default_queue(queue))
+ n = qid - 1;
+ else if (nvme_tcp_read_queue(queue))
+ n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
+ else if (nvme_tcp_poll_queue(queue))
+ n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
+ ctrl->io_queues[HCTX_TYPE_READ] - 1;
+ queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+}
+
static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
int qid, size_t queue_size)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
- struct linger sol = { .l_onoff = 1, .l_linger = 0 };
- int ret, opt, rcv_pdu_size, n;
+ int ret, rcv_pdu_size;
+ mutex_init(&queue->queue_lock);
queue->ctrl = ctrl;
+ init_llist_head(&queue->req_list);
INIT_LIST_HEAD(&queue->send_list);
- spin_lock_init(&queue->lock);
+ mutex_init(&queue->send_mutex);
INIT_WORK(&queue->io_work, nvme_tcp_io_work);
queue->queue_size = queue_size;
@@ -1270,63 +1419,34 @@
if (ret) {
dev_err(nctrl->device,
"failed to create socket: %d\n", ret);
- return ret;
+ goto err_destroy_mutex;
}
/* Single syn retry */
- opt = 1;
- ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
- (char *)&opt, sizeof(opt));
- if (ret) {
- dev_err(nctrl->device,
- "failed to set TCP_SYNCNT sock opt %d\n", ret);
- goto err_sock;
- }
+ tcp_sock_set_syncnt(queue->sock->sk, 1);
/* Set TCP no delay */
- opt = 1;
- ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
- TCP_NODELAY, (char *)&opt, sizeof(opt));
- if (ret) {
- dev_err(nctrl->device,
- "failed to set TCP_NODELAY sock opt %d\n", ret);
- goto err_sock;
- }
+ tcp_sock_set_nodelay(queue->sock->sk);
/*
* Cleanup whatever is sitting in the TCP transmit queue on socket
* close. This is done to prevent stale data from being sent should
* the network connection be restored before TCP times out.
*/
- ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
- (char *)&sol, sizeof(sol));
- if (ret) {
- dev_err(nctrl->device,
- "failed to set SO_LINGER sock opt %d\n", ret);
- goto err_sock;
- }
+ sock_no_linger(queue->sock->sk);
+
+ if (so_priority > 0)
+ sock_set_priority(queue->sock->sk, so_priority);
/* Set socket type of service */
- if (nctrl->opts->tos >= 0) {
- opt = nctrl->opts->tos;
- ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS,
- (char *)&opt, sizeof(opt));
- if (ret) {
- dev_err(nctrl->device,
- "failed to set IP_TOS sock opt %d\n", ret);
- goto err_sock;
- }
- }
+ if (nctrl->opts->tos >= 0)
+ ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
/* Set 10 seconds timeout for icresp recvmsg */
queue->sock->sk->sk_rcvtimeo = 10 * HZ;
queue->sock->sk->sk_allocation = GFP_ATOMIC;
- if (!qid)
- n = 0;
- else
- n = (qid - 1) % num_online_cpus();
- queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+ nvme_tcp_set_queue_io_cpu(queue);
queue->request = NULL;
queue->data_remaining = 0;
queue->ddgst_remaining = 0;
@@ -1408,6 +1528,8 @@
err_sock:
sock_release(queue->sock);
queue->sock = NULL;
+err_destroy_mutex:
+ mutex_destroy(&queue->queue_lock);
return ret;
}
@@ -1435,9 +1557,10 @@
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid];
- if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
- return;
- __nvme_tcp_stop_queue(queue);
+ mutex_lock(&queue->queue_lock);
+ if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
+ __nvme_tcp_stop_queue(queue);
+ mutex_unlock(&queue->queue_lock);
}
static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
@@ -1474,7 +1597,8 @@
set->ops = &nvme_tcp_admin_mq_ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = 2; /* connect + keep-alive */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->numa_node;
+ set->flags = BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
set->driver_data = ctrl;
set->nr_hw_queues = 1;
@@ -1485,8 +1609,8 @@
set->ops = &nvme_tcp_mq_ops;
set->queue_depth = nctrl->sqsize + 1;
set->reserved_tags = 1; /* fabric connect */
- set->numa_node = NUMA_NO_NODE;
- set->flags = BLK_MQ_F_SHOULD_MERGE;
+ set->numa_node = nctrl->numa_node;
+ set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
set->driver_data = ctrl;
set->nr_hw_queues = nctrl->queue_count - 1;
@@ -1890,8 +2014,14 @@
}
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
- /* state change failure is ok if we're in DELETING state */
- WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+ /*
+ * state change failure is ok if we started ctrl delete,
+ * unless we're during creation of a new controller to
+ * avoid races with teardown flow.
+ */
+ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
+ ctrl->state != NVME_CTRL_DELETING_NOIO);
+ WARN_ON_ONCE(new);
ret = -EINVAL;
goto destroy_io;
}
@@ -1947,6 +2077,7 @@
struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
nvme_stop_keep_alive(ctrl);
+ flush_work(&ctrl->async_event_work);
nvme_tcp_teardown_io_queues(ctrl, false);
/* unquiesce to fail fast pending requests */
nvme_start_queues(ctrl);
@@ -1954,8 +2085,9 @@
blk_mq_unquiesce_queue(ctrl->admin_q);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
- /* state change failure is ok if we're in DELETING state */
- WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+ /* state change failure is ok if we started ctrl delete */
+ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
+ ctrl->state != NVME_CTRL_DELETING_NOIO);
return;
}
@@ -1990,8 +2122,9 @@
nvme_tcp_teardown_ctrl(ctrl, false);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
- /* state change failure is ok if we're in DELETING state */
- WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+ /* state change failure is ok if we started ctrl delete */
+ WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
+ ctrl->state != NVME_CTRL_DELETING_NOIO);
return;
}
@@ -2078,7 +2211,7 @@
ctrl->async_req.curr_bio = NULL;
ctrl->async_req.data_len = 0;
- nvme_tcp_queue_request(&ctrl->async_req);
+ nvme_tcp_queue_request(&ctrl->async_req, true, true);
}
static void nvme_tcp_complete_timed_out(struct request *rq)
@@ -2202,6 +2335,14 @@
return 0;
}
+static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
+{
+ struct nvme_tcp_queue *queue = hctx->driver_data;
+
+ if (!llist_empty(&queue->req_list))
+ queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+}
+
static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@@ -2221,7 +2362,7 @@
blk_mq_start_request(rq);
- nvme_tcp_queue_request(req);
+ nvme_tcp_queue_request(req, true, bd->last);
return BLK_STS_OK;
}
@@ -2276,14 +2417,20 @@
struct nvme_tcp_queue *queue = hctx->driver_data;
struct sock *sk = queue->sock->sk;
+ if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
+ return 0;
+
+ set_bit(NVME_TCP_Q_POLLING, &queue->flags);
if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
sk_busy_loop(sk, true);
nvme_tcp_try_recv(queue);
+ clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
return queue->nr_cqe;
}
-static struct blk_mq_ops nvme_tcp_mq_ops = {
+static const struct blk_mq_ops nvme_tcp_mq_ops = {
.queue_rq = nvme_tcp_queue_rq,
+ .commit_rqs = nvme_tcp_commit_rqs,
.complete = nvme_complete_rq,
.init_request = nvme_tcp_init_request,
.exit_request = nvme_tcp_exit_request,
@@ -2293,7 +2440,7 @@
.poll = nvme_tcp_poll,
};
-static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
+static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
.queue_rq = nvme_tcp_queue_rq,
.complete = nvme_complete_rq,
.init_request = nvme_tcp_init_request,
@@ -2420,7 +2567,6 @@
out_uninit_ctrl:
nvme_uninit_ctrl(&ctrl->ctrl);
nvme_put_ctrl(&ctrl->ctrl);
- nvme_put_ctrl(&ctrl->ctrl);
if (ret > 0)
ret = -EIO;
return ERR_PTR(ret);