Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 2ae8462..96b67a7 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -19,6 +19,16 @@
#define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE)
+/* Define the socket priority to use for connections where it is desirable
+ * that the NIC consider performing optimized packet processing or filtering.
+ * A non-zero value is sufficient to indicate general consideration of any
+ * possible optimization. Making it a module param allows for alternative
+ * values that may be unique for some NIC implementations.
+ */
+static int so_priority;
+module_param(so_priority, int, 0644);
+MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority");
+
#define NVMET_TCP_RECV_BUDGET 8
#define NVMET_TCP_SEND_BUDGET 8
#define NVMET_TCP_IO_WORK_BUDGET 64
@@ -84,7 +94,6 @@
struct socket *sock;
struct nvmet_tcp_port *port;
struct work_struct io_work;
- int cpu;
struct nvmet_cq nvme_cq;
struct nvmet_sq nvme_sq;
@@ -134,7 +143,6 @@
struct work_struct accept_work;
struct nvmet_port *nport;
struct sockaddr_storage addr;
- int last_cpu;
void (*data_ready)(struct sock *);
};
@@ -143,7 +151,7 @@
static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
static struct workqueue_struct *nvmet_tcp_wq;
-static struct nvmet_fabrics_ops nvmet_tcp_ops;
+static const struct nvmet_fabrics_ops nvmet_tcp_ops;
static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
@@ -209,6 +217,11 @@
list_add_tail(&cmd->entry, &cmd->queue->free_list);
}
+static inline int queue_cpu(struct nvmet_tcp_queue *queue)
+{
+ return queue->sock->sk->sk_incoming_cpu;
+}
+
static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
{
return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
@@ -321,12 +334,20 @@
kernel_sock_shutdown(queue->sock, SHUT_RDWR);
}
+static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status)
+{
+ if (status == -EPIPE || status == -ECONNRESET)
+ kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+ else
+ nvmet_tcp_fatal_error(queue);
+}
+
static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
{
struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
u32 len = le32_to_cpu(sgl->length);
- if (!cmd->req.data_len)
+ if (!len)
return 0;
if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
@@ -358,7 +379,7 @@
return NVME_SC_INTERNAL;
}
-static void nvmet_tcp_ddgst(struct ahash_request *hash,
+static void nvmet_tcp_send_ddgst(struct ahash_request *hash,
struct nvmet_tcp_cmd *cmd)
{
ahash_request_set_crypt(hash, cmd->req.sg,
@@ -366,6 +387,23 @@
crypto_ahash_digest(hash);
}
+static void nvmet_tcp_recv_ddgst(struct ahash_request *hash,
+ struct nvmet_tcp_cmd *cmd)
+{
+ struct scatterlist sg;
+ struct kvec *iov;
+ int i;
+
+ crypto_ahash_init(hash);
+ for (i = 0, iov = cmd->iov; i < cmd->nr_mapped; i++, iov++) {
+ sg_init_one(&sg, iov->iov_base, iov->iov_len);
+ ahash_request_set_crypt(hash, &sg, NULL, iov->iov_len);
+ crypto_ahash_update(hash);
+ }
+ ahash_request_set_crypt(hash, NULL, (void *)&cmd->exp_ddgst, 0);
+ crypto_ahash_final(hash);
+}
+
static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
{
struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
@@ -390,7 +428,7 @@
if (queue->data_digest) {
pdu->hdr.flags |= NVME_TCP_F_DDGST;
- nvmet_tcp_ddgst(queue->snd_hash, cmd);
+ nvmet_tcp_send_ddgst(queue->snd_hash, cmd);
}
if (cmd->queue->hdr_digest) {
@@ -447,17 +485,11 @@
static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
{
struct llist_node *node;
+ struct nvmet_tcp_cmd *cmd;
- node = llist_del_all(&queue->resp_list);
- if (!node)
- return;
-
- while (node) {
- struct nvmet_tcp_cmd *cmd = llist_entry(node,
- struct nvmet_tcp_cmd, lentry);
-
+ for (node = llist_del_all(&queue->resp_list); node; node = node->next) {
+ cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry);
list_add(&cmd->entry, &queue->resp_send_list);
- node = node->next;
queue->send_list_len++;
}
}
@@ -493,9 +525,34 @@
struct nvmet_tcp_cmd *cmd =
container_of(req, struct nvmet_tcp_cmd, req);
struct nvmet_tcp_queue *queue = cmd->queue;
+ struct nvme_sgl_desc *sgl;
+ u32 len;
+
+ if (unlikely(cmd == queue->cmd)) {
+ sgl = &cmd->req.cmd->common.dptr.sgl;
+ len = le32_to_cpu(sgl->length);
+
+ /*
+ * Wait for inline data before processing the response.
+ * Avoid using helpers, this might happen before
+ * nvmet_req_init is completed.
+ */
+ if (queue->rcv_state == NVMET_TCP_RECV_PDU &&
+ len && len <= cmd->req.port->inline_data_size &&
+ nvme_is_write(cmd->req.cmd))
+ return;
+ }
llist_add(&cmd->lentry, &queue->resp_list);
- queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
+ queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
+}
+
+static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd)
+{
+ if (unlikely(cmd->flags & NVMET_TCP_F_INIT_FAILED))
+ nvmet_tcp_queue_response(&cmd->req);
+ else
+ cmd->req.execute(&cmd->req);
}
static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
@@ -506,7 +563,7 @@
ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
offset_in_page(cmd->data_pdu) + cmd->offset,
- left, MSG_DONTWAIT | MSG_MORE);
+ left, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
if (ret <= 0)
return ret;
@@ -534,7 +591,7 @@
if ((!last_in_batch && cmd->queue->send_list_len) ||
cmd->wbytes_done + left < cmd->req.transfer_len ||
queue->data_digest || !queue->nvme_sq.sqhd_disabled)
- flags |= MSG_MORE;
+ flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
left, flags);
@@ -581,7 +638,7 @@
int ret;
if (!last_in_batch && cmd->queue->send_list_len)
- flags |= MSG_MORE;
+ flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
else
flags |= MSG_EOR;
@@ -610,7 +667,7 @@
int ret;
if (!last_in_batch && cmd->queue->send_list_len)
- flags |= MSG_MORE;
+ flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
else
flags |= MSG_EOR;
@@ -628,21 +685,31 @@
return 1;
}
-static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd)
+static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
struct nvmet_tcp_queue *queue = cmd->queue;
+ int left = NVME_TCP_DIGEST_LENGTH - cmd->offset;
struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
struct kvec iov = {
- .iov_base = &cmd->exp_ddgst + cmd->offset,
- .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
+ .iov_base = (u8 *)&cmd->exp_ddgst + cmd->offset,
+ .iov_len = left
};
int ret;
+ if (!last_in_batch && cmd->queue->send_list_len)
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags |= MSG_EOR;
+
ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
if (unlikely(ret <= 0))
return ret;
cmd->offset += ret;
+ left -= ret;
+
+ if (left)
+ return -EAGAIN;
if (queue->nvme_sq.sqhd_disabled) {
cmd->queue->snd_cmd = NULL;
@@ -678,7 +745,7 @@
}
if (cmd->state == NVMET_TCP_SEND_DDGST) {
- ret = nvmet_try_send_ddgst(cmd);
+ ret = nvmet_try_send_ddgst(cmd, last_in_batch);
if (ret <= 0)
goto done_send;
}
@@ -709,11 +776,15 @@
for (i = 0; i < budget; i++) {
ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
- if (ret <= 0)
+ if (unlikely(ret < 0)) {
+ nvmet_tcp_socket_error(queue, ret);
+ goto done;
+ } else if (ret == 0) {
break;
+ }
(*sends)++;
}
-
+done:
return ret;
}
@@ -825,13 +896,11 @@
static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
{
+ size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
int ret;
- /* recover the expected data transfer length */
- req->data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
-
if (!nvme_is_write(cmd->req.cmd) ||
- req->data_len > cmd->req.port->inline_data_size) {
+ data_len > cmd->req.port->inline_data_size) {
nvmet_prepare_receive_pdu(queue);
return;
}
@@ -922,7 +991,7 @@
le32_to_cpu(req->cmd->common.dptr.sgl.length));
nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
- return -EAGAIN;
+ return 0;
}
ret = nvmet_tcp_map_data(queue->cmd);
@@ -947,7 +1016,7 @@
goto out;
}
- nvmet_req_execute(&queue->cmd->req);
+ queue->cmd->req.execute(&queue->cmd->req);
out:
nvmet_prepare_receive_pdu(queue);
return ret;
@@ -1020,7 +1089,7 @@
}
if (queue->hdr_digest &&
- nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) {
+ nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) {
nvmet_tcp_fatal_error(queue); /* fatal */
return -EPROTO;
}
@@ -1038,7 +1107,7 @@
{
struct nvmet_tcp_queue *queue = cmd->queue;
- nvmet_tcp_ddgst(queue->rcv_hash, cmd);
+ nvmet_tcp_recv_ddgst(queue->rcv_hash, cmd);
queue->offset = 0;
queue->left = NVME_TCP_DIGEST_LENGTH;
queue->rcv_state = NVMET_TCP_RECV_DDGST;
@@ -1060,16 +1129,14 @@
}
nvmet_tcp_unmap_pdu_iovec(cmd);
-
- if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
- cmd->rbytes_done == cmd->req.transfer_len) {
- if (queue->data_digest) {
- nvmet_tcp_prep_recv_ddgst(cmd);
- return 0;
- }
- nvmet_req_execute(&cmd->req);
+ if (queue->data_digest) {
+ nvmet_tcp_prep_recv_ddgst(cmd);
+ return 0;
}
+ if (cmd->rbytes_done == cmd->req.transfer_len)
+ nvmet_tcp_execute_request(cmd);
+
nvmet_prepare_receive_pdu(queue);
return 0;
}
@@ -1105,9 +1172,9 @@
goto out;
}
- if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
- cmd->rbytes_done == cmd->req.transfer_len)
- nvmet_req_execute(&cmd->req);
+ if (cmd->rbytes_done == cmd->req.transfer_len)
+ nvmet_tcp_execute_request(cmd);
+
ret = 0;
out:
nvmet_prepare_receive_pdu(queue);
@@ -1155,11 +1222,15 @@
for (i = 0; i < budget; i++) {
ret = nvmet_tcp_try_recv_one(queue);
- if (ret <= 0)
+ if (unlikely(ret < 0)) {
+ nvmet_tcp_socket_error(queue, ret);
+ goto done;
+ } else if (ret == 0) {
break;
+ }
(*recvs)++;
}
-
+done:
return ret;
}
@@ -1184,27 +1255,16 @@
pending = false;
ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
- if (ret > 0) {
+ if (ret > 0)
pending = true;
- } else if (ret < 0) {
- if (ret == -EPIPE || ret == -ECONNRESET)
- kernel_sock_shutdown(queue->sock, SHUT_RDWR);
- else
- nvmet_tcp_fatal_error(queue);
+ else if (ret < 0)
return;
- }
ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
- if (ret > 0) {
- /* transmitted message/data */
+ if (ret > 0)
pending = true;
- } else if (ret < 0) {
- if (ret == -EPIPE || ret == -ECONNRESET)
- kernel_sock_shutdown(queue->sock, SHUT_RDWR);
- else
- nvmet_tcp_fatal_error(queue);
+ else if (ret < 0)
return;
- }
} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
@@ -1212,7 +1272,7 @@
* We exahusted our budget, requeue our selves
*/
if (pending)
- queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+ queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
}
static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
@@ -1343,6 +1403,7 @@
static void nvmet_tcp_release_queue_work(struct work_struct *w)
{
+ struct page *page;
struct nvmet_tcp_queue *queue =
container_of(w, struct nvmet_tcp_queue, release_work);
@@ -1362,6 +1423,8 @@
nvmet_tcp_free_crypto(queue);
ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
+ page = virt_to_head_page(queue->pf_cache.va);
+ __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
kfree(queue);
}
@@ -1372,7 +1435,7 @@
read_lock_bh(&sk->sk_callback_lock);
queue = sk->sk_user_data;
if (likely(queue))
- queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+ queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
read_unlock_bh(&sk->sk_callback_lock);
}
@@ -1392,7 +1455,7 @@
if (sk_stream_is_writeable(sk)) {
clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+ queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
}
out:
read_unlock_bh(&sk->sk_callback_lock);
@@ -1426,7 +1489,6 @@
{
struct socket *sock = queue->sock;
struct inet_sock *inet = inet_sk(sock->sk);
- struct linger sol = { .l_onoff = 1, .l_linger = 0 };
int ret;
ret = kernel_getsockname(sock,
@@ -1444,32 +1506,36 @@
* close. This is done to prevent stale data from being sent should
* the network connection be restored before TCP times out.
*/
- ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
- (char *)&sol, sizeof(sol));
- if (ret)
- return ret;
+ sock_no_linger(sock->sk);
+
+ if (so_priority > 0)
+ sock_set_priority(sock->sk, so_priority);
/* Set socket type of service */
- if (inet->rcv_tos > 0) {
- int tos = inet->rcv_tos;
+ if (inet->rcv_tos > 0)
+ ip_sock_set_tos(sock->sk, inet->rcv_tos);
- ret = kernel_setsockopt(sock, SOL_IP, IP_TOS,
- (char *)&tos, sizeof(tos));
- if (ret)
- return ret;
- }
-
+ ret = 0;
write_lock_bh(&sock->sk->sk_callback_lock);
- sock->sk->sk_user_data = queue;
- queue->data_ready = sock->sk->sk_data_ready;
- sock->sk->sk_data_ready = nvmet_tcp_data_ready;
- queue->state_change = sock->sk->sk_state_change;
- sock->sk->sk_state_change = nvmet_tcp_state_change;
- queue->write_space = sock->sk->sk_write_space;
- sock->sk->sk_write_space = nvmet_tcp_write_space;
+ if (sock->sk->sk_state != TCP_ESTABLISHED) {
+ /*
+ * If the socket is already closing, don't even start
+ * consuming it
+ */
+ ret = -ENOTCONN;
+ } else {
+ sock->sk->sk_user_data = queue;
+ queue->data_ready = sock->sk->sk_data_ready;
+ sock->sk->sk_data_ready = nvmet_tcp_data_ready;
+ queue->state_change = sock->sk->sk_state_change;
+ sock->sk->sk_state_change = nvmet_tcp_state_change;
+ queue->write_space = sock->sk->sk_write_space;
+ sock->sk->sk_write_space = nvmet_tcp_write_space;
+ queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
+ }
write_unlock_bh(&sock->sk->sk_callback_lock);
- return 0;
+ return ret;
}
static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
@@ -1507,9 +1573,6 @@
if (ret)
goto out_free_connect;
- port->last_cpu = cpumask_next_wrap(port->last_cpu,
- cpu_online_mask, -1, false);
- queue->cpu = port->last_cpu;
nvmet_prepare_receive_pdu(queue);
mutex_lock(&nvmet_tcp_queue_mutex);
@@ -1520,8 +1583,6 @@
if (ret)
goto out_destroy_sq;
- queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
-
return 0;
out_destroy_sq:
mutex_lock(&nvmet_tcp_queue_mutex);
@@ -1578,7 +1639,7 @@
{
struct nvmet_tcp_port *port;
__kernel_sa_family_t af;
- int opt, ret;
+ int ret;
port = kzalloc(sizeof(*port), GFP_KERNEL);
if (!port)
@@ -1607,7 +1668,6 @@
}
port->nport = nport;
- port->last_cpu = -1;
INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
if (port->nport->inline_data_size < 0)
port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
@@ -1622,21 +1682,10 @@
port->sock->sk->sk_user_data = port;
port->data_ready = port->sock->sk->sk_data_ready;
port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
-
- opt = 1;
- ret = kernel_setsockopt(port->sock, IPPROTO_TCP,
- TCP_NODELAY, (char *)&opt, sizeof(opt));
- if (ret) {
- pr_err("failed to set TCP_NODELAY sock opt %d\n", ret);
- goto err_sock;
- }
-
- ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR,
- (char *)&opt, sizeof(opt));
- if (ret) {
- pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret);
- goto err_sock;
- }
+ sock_set_reuseaddr(port->sock->sk);
+ tcp_sock_set_nodelay(port->sock->sk);
+ if (so_priority > 0)
+ sock_set_priority(port->sock->sk, so_priority);
ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
sizeof(port->addr));
@@ -1664,6 +1713,17 @@
return ret;
}
+static void nvmet_tcp_destroy_port_queues(struct nvmet_tcp_port *port)
+{
+ struct nvmet_tcp_queue *queue;
+
+ mutex_lock(&nvmet_tcp_queue_mutex);
+ list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
+ if (queue->port == port)
+ kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+ mutex_unlock(&nvmet_tcp_queue_mutex);
+}
+
static void nvmet_tcp_remove_port(struct nvmet_port *nport)
{
struct nvmet_tcp_port *port = nport->priv;
@@ -1673,6 +1733,11 @@
port->sock->sk->sk_user_data = NULL;
write_unlock_bh(&port->sock->sk->sk_callback_lock);
cancel_work_sync(&port->accept_work);
+ /*
+ * Destroy the remaining queues, which do not belong to any
+ * controller yet.
+ */
+ nvmet_tcp_destroy_port_queues(port);
sock_release(port->sock);
kfree(port);
@@ -1721,11 +1786,10 @@
}
}
-static struct nvmet_fabrics_ops nvmet_tcp_ops = {
+static const struct nvmet_fabrics_ops nvmet_tcp_ops = {
.owner = THIS_MODULE,
.type = NVMF_TRTYPE_TCP,
.msdbd = 1,
- .has_keyed_sgls = 0,
.add_port = nvmet_tcp_add_port,
.remove_port = nvmet_tcp_remove_port,
.queue_response = nvmet_tcp_queue_response,