1// SPDX-License-Identifier: GPL-2.0
2/*
3 * NVMe over Fabrics TCP target.
4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
5 */
6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7#include <linux/module.h>
8#include <linux/init.h>
9#include <linux/slab.h>
10#include <linux/err.h>
11#include <linux/nvme-tcp.h>
12#include <net/sock.h>
13#include <net/tcp.h>
14#include <linux/inet.h>
15#include <linux/llist.h>
16#include <crypto/hash.h>
17
18#include "nvmet.h"
19
20#define NVMET_TCP_DEF_INLINE_DATA_SIZE (4 * PAGE_SIZE)
21
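/*
 * Budgets for the io_work handler below: each pass performs at most
 * NVMET_TCP_RECV_BUDGET receive and NVMET_TCP_SEND_BUDGET send operations,
 * and once roughly NVMET_TCP_IO_WORK_BUDGET total operations have been done
 * the work item stops and requeues itself if work remains, so a single busy
 * queue cannot monopolize the workqueue.
 */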
22#define NVMET_TCP_RECV_BUDGET 8
23#define NVMET_TCP_SEND_BUDGET 8
24#define NVMET_TCP_IO_WORK_BUDGET 64
25
26enum nvmet_tcp_send_state {
27 NVMET_TCP_SEND_DATA_PDU,
28 NVMET_TCP_SEND_DATA,
29 NVMET_TCP_SEND_R2T,
30 NVMET_TCP_SEND_DDGST,
31 NVMET_TCP_SEND_RESPONSE
32};
33
34enum nvmet_tcp_recv_state {
35 NVMET_TCP_RECV_PDU,
36 NVMET_TCP_RECV_DATA,
37 NVMET_TCP_RECV_DDGST,
38 NVMET_TCP_RECV_ERR,
39};
40
41enum {
42 NVMET_TCP_F_INIT_FAILED = (1 << 0),
43};
44
45struct nvmet_tcp_cmd {
46 struct nvmet_tcp_queue *queue;
47 struct nvmet_req req;
48
49 struct nvme_tcp_cmd_pdu *cmd_pdu;
50 struct nvme_tcp_rsp_pdu *rsp_pdu;
51 struct nvme_tcp_data_pdu *data_pdu;
52 struct nvme_tcp_r2t_pdu *r2t_pdu;
53
54 u32 rbytes_done;
55 u32 wbytes_done;
56
57 u32 pdu_len;
58 u32 pdu_recv;
59 int sg_idx;
60 int nr_mapped;
61 struct msghdr recv_msg;
62 struct kvec *iov;
63 u32 flags;
64
65 struct list_head entry;
66 struct llist_node lentry;
67
68 /* send state */
69 u32 offset;
70 struct scatterlist *cur_sg;
71 enum nvmet_tcp_send_state state;
72
73 __le32 exp_ddgst;
74 __le32 recv_ddgst;
75};
76
77enum nvmet_tcp_queue_state {
78 NVMET_TCP_Q_CONNECTING,
79 NVMET_TCP_Q_LIVE,
80 NVMET_TCP_Q_DISCONNECTING,
81};
82
83struct nvmet_tcp_queue {
84 struct socket *sock;
85 struct nvmet_tcp_port *port;
86 struct work_struct io_work;
87 int cpu;
88 struct nvmet_cq nvme_cq;
89 struct nvmet_sq nvme_sq;
90
91 /* send state */
92 struct nvmet_tcp_cmd *cmds;
93 unsigned int nr_cmds;
94 struct list_head free_list;
95 struct llist_head resp_list;
96 struct list_head resp_send_list;
97 int send_list_len;
98 struct nvmet_tcp_cmd *snd_cmd;
99
100 /* recv state */
101 int offset;
102 int left;
103 enum nvmet_tcp_recv_state rcv_state;
104 struct nvmet_tcp_cmd *cmd;
105 union nvme_tcp_pdu pdu;
106
107 /* digest state */
108 bool hdr_digest;
109 bool data_digest;
110 struct ahash_request *snd_hash;
111 struct ahash_request *rcv_hash;
112
113 spinlock_t state_lock;
114 enum nvmet_tcp_queue_state state;
115
116 struct sockaddr_storage sockaddr;
117 struct sockaddr_storage sockaddr_peer;
118 struct work_struct release_work;
119
120 int idx;
121 struct list_head queue_list;
122
123 struct nvmet_tcp_cmd connect;
124
125 struct page_frag_cache pf_cache;
126
127 void (*data_ready)(struct sock *);
128 void (*state_change)(struct sock *);
129 void (*write_space)(struct sock *);
130};
131
132struct nvmet_tcp_port {
133 struct socket *sock;
134 struct work_struct accept_work;
135 struct nvmet_port *nport;
136 struct sockaddr_storage addr;
137 int last_cpu;
138 void (*data_ready)(struct sock *);
139};
140
141static DEFINE_IDA(nvmet_tcp_queue_ida);
142static LIST_HEAD(nvmet_tcp_queue_list);
143static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
144
145static struct workqueue_struct *nvmet_tcp_wq;
146static struct nvmet_fabrics_ops nvmet_tcp_ops;
147static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
148static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
149
150static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
151 struct nvmet_tcp_cmd *cmd)
152{
153 if (unlikely(!queue->nr_cmds)) {
154 /* We didn't allocate cmds yet, send 0xffff */
155 return USHRT_MAX;
156 }
157
158 return cmd - queue->cmds;
159}
160
161static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
162{
163 return nvme_is_write(cmd->req.cmd) &&
164 cmd->rbytes_done < cmd->req.transfer_len;
165}
166
167static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
168{
169 return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
170}
171
172static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
173{
174 return !nvme_is_write(cmd->req.cmd) &&
175 cmd->req.transfer_len > 0 &&
176 !cmd->req.cqe->status;
177}
178
179static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
180{
181 return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
182 !cmd->rbytes_done;
183}
184
185static inline struct nvmet_tcp_cmd *
186nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
187{
188 struct nvmet_tcp_cmd *cmd;
189
190 cmd = list_first_entry_or_null(&queue->free_list,
191 struct nvmet_tcp_cmd, entry);
192 if (!cmd)
193 return NULL;
194 list_del_init(&cmd->entry);
195
196 cmd->rbytes_done = cmd->wbytes_done = 0;
197 cmd->pdu_len = 0;
198 cmd->pdu_recv = 0;
199 cmd->iov = NULL;
200 cmd->flags = 0;
201 return cmd;
202}
203
204static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
205{
206 if (unlikely(cmd == &cmd->queue->connect))
207 return;
208
209 list_add_tail(&cmd->entry, &cmd->queue->free_list);
210}
211
212static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
213{
214 return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
215}
216
217static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
218{
219 return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
220}
221
222static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
223 void *pdu, size_t len)
224{
225 struct scatterlist sg;
226
227 sg_init_one(&sg, pdu, len);
228 ahash_request_set_crypt(hash, &sg, pdu + len, len);
229 crypto_ahash_digest(hash);
230}
231
232static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
233 void *pdu, size_t len)
234{
235 struct nvme_tcp_hdr *hdr = pdu;
236 __le32 recv_digest;
237 __le32 exp_digest;
238
239 if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
240 pr_err("queue %d: header digest enabled but no header digest\n",
241 queue->idx);
242 return -EPROTO;
243 }
244
245 recv_digest = *(__le32 *)(pdu + hdr->hlen);
246 nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
247 exp_digest = *(__le32 *)(pdu + hdr->hlen);
248 if (recv_digest != exp_digest) {
249 pr_err("queue %d: header digest error: recv %#x expected %#x\n",
250 queue->idx, le32_to_cpu(recv_digest),
251 le32_to_cpu(exp_digest));
252 return -EPROTO;
253 }
254
255 return 0;
256}
257
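/*
 * Only sanity-check the DDGST flag here: if data digest was negotiated and
 * the PDU carries a payload, the flag must be set. The digest value itself
 * is verified later, after the data has been received.
 */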
258static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
259{
260 struct nvme_tcp_hdr *hdr = pdu;
261 u8 digest_len = nvmet_tcp_hdgst_len(queue);
262 u32 len;
263
264 len = le32_to_cpu(hdr->plen) - hdr->hlen -
265 (hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);
266
267 if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
268 pr_err("queue %d: data digest flag is cleared\n", queue->idx);
269 return -EPROTO;
270 }
271
272 return 0;
273}
274
275static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
276{
277 struct scatterlist *sg;
278 int i;
279
280 sg = &cmd->req.sg[cmd->sg_idx];
281
282 for (i = 0; i < cmd->nr_mapped; i++)
283 kunmap(sg_page(&sg[i]));
284}
285
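/*
 * kmap the command's scatterlist pages, starting at the current receive
 * offset (rbytes_done), into cmd->iov and point recv_msg.msg_iter at it so
 * incoming PDU data is copied directly into the request's data buffer.
 */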
286static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
287{
288 struct kvec *iov = cmd->iov;
289 struct scatterlist *sg;
290 u32 length, offset, sg_offset;
291
292 length = cmd->pdu_len;
293 cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
294 offset = cmd->rbytes_done;
295 cmd->sg_idx = offset / PAGE_SIZE;
296 sg_offset = offset % PAGE_SIZE;
297 sg = &cmd->req.sg[cmd->sg_idx];
298
299 while (length) {
300 u32 iov_len = min_t(u32, length, sg->length - sg_offset);
301
302 iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
303 iov->iov_len = iov_len;
304
305 length -= iov_len;
306 sg = sg_next(sg);
307 iov++;
308 sg_offset = 0;
309 }
310
311 iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
312 cmd->nr_mapped, cmd->pdu_len);
313}
314
315static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
316{
317 queue->rcv_state = NVMET_TCP_RECV_ERR;
318 if (queue->nvme_sq.ctrl)
319 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
320 else
321 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
322}
323
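/*
 * Allocate the scatterlist for the data described by the command's SGL.
 * An in-capsule data block descriptor (a write with inline data) also sets
 * pdu_len so the payload is consumed from the command PDU itself, bounded
 * by the port's inline_data_size.
 */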
324static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
325{
326 struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
327 u32 len = le32_to_cpu(sgl->length);
328
329 if (!cmd->req.data_len)
330 return 0;
331
332 if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
333 NVME_SGL_FMT_OFFSET)) {
334 if (!nvme_is_write(cmd->req.cmd))
335 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
336
337 if (len > cmd->req.port->inline_data_size)
338 return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
339 cmd->pdu_len = len;
340 }
341 cmd->req.transfer_len += len;
342
343 cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
344 if (!cmd->req.sg)
345 return NVME_SC_INTERNAL;
346 cmd->cur_sg = cmd->req.sg;
347
348 if (nvmet_tcp_has_data_in(cmd)) {
349 cmd->iov = kmalloc_array(cmd->req.sg_cnt,
350 sizeof(*cmd->iov), GFP_KERNEL);
351 if (!cmd->iov)
352 goto err;
353 }
354
355 return 0;
356err:
357 sgl_free(cmd->req.sg);
358 return NVME_SC_INTERNAL;
359}
360
361static void nvmet_tcp_ddgst(struct ahash_request *hash,
362 struct nvmet_tcp_cmd *cmd)
363{
364 ahash_request_set_crypt(hash, cmd->req.sg,
365 (void *)&cmd->exp_ddgst, cmd->req.transfer_len);
366 crypto_ahash_digest(hash);
367}
368
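/*
 * Build a C2H data PDU for read data: plen accounts for the header, the
 * payload and any digests, and when the host disabled SQ head pointer
 * updates the SUCCESS flag is piggybacked so no separate response capsule
 * is sent afterwards.
 */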
369static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
370{
371 struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
372 struct nvmet_tcp_queue *queue = cmd->queue;
373 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
374 u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
375
376 cmd->offset = 0;
377 cmd->state = NVMET_TCP_SEND_DATA_PDU;
378
379 pdu->hdr.type = nvme_tcp_c2h_data;
380 pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
381 NVME_TCP_F_DATA_SUCCESS : 0);
382 pdu->hdr.hlen = sizeof(*pdu);
383 pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
384 pdu->hdr.plen =
385 cpu_to_le32(pdu->hdr.hlen + hdgst +
386 cmd->req.transfer_len + ddgst);
387 pdu->command_id = cmd->req.cqe->command_id;
388 pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
389 pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
390
391 if (queue->data_digest) {
392 pdu->hdr.flags |= NVME_TCP_F_DDGST;
393 nvmet_tcp_ddgst(queue->snd_hash, cmd);
394 }
395
396 if (cmd->queue->hdr_digest) {
397 pdu->hdr.flags |= NVME_TCP_F_HDGST;
398 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
399 }
400}
401
402static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
403{
404 struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
405 struct nvmet_tcp_queue *queue = cmd->queue;
406 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
407
408 cmd->offset = 0;
409 cmd->state = NVMET_TCP_SEND_R2T;
410
411 pdu->hdr.type = nvme_tcp_r2t;
412 pdu->hdr.flags = 0;
413 pdu->hdr.hlen = sizeof(*pdu);
414 pdu->hdr.pdo = 0;
415 pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
416
417 pdu->command_id = cmd->req.cmd->common.command_id;
418 pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
419 pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
420 pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
421 if (cmd->queue->hdr_digest) {
422 pdu->hdr.flags |= NVME_TCP_F_HDGST;
423 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
424 }
425}
426
427static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
428{
429 struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
430 struct nvmet_tcp_queue *queue = cmd->queue;
431 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
432
433 cmd->offset = 0;
434 cmd->state = NVMET_TCP_SEND_RESPONSE;
435
436 pdu->hdr.type = nvme_tcp_rsp;
437 pdu->hdr.flags = 0;
438 pdu->hdr.hlen = sizeof(*pdu);
439 pdu->hdr.pdo = 0;
440 pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
441 if (cmd->queue->hdr_digest) {
442 pdu->hdr.flags |= NVME_TCP_F_HDGST;
443 nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
444 }
445}
446
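/*
 * Completions are queued from arbitrary context onto the lockless resp_list
 * (an llist); io_work drains it here into resp_send_list, which is only
 * touched from io_work context and therefore needs no locking.
 */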
447static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
448{
449 struct llist_node *node;
450
451 node = llist_del_all(&queue->resp_list);
452 if (!node)
453 return;
454
455 while (node) {
456 struct nvmet_tcp_cmd *cmd = llist_entry(node,
457 struct nvmet_tcp_cmd, lentry);
458
459 list_add(&cmd->entry, &queue->resp_send_list);
460 node = node->next;
461 queue->send_list_len++;
462 }
463}
464
465static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
466{
467 queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
468 struct nvmet_tcp_cmd, entry);
469 if (!queue->snd_cmd) {
470 nvmet_tcp_process_resp_list(queue);
471 queue->snd_cmd =
472 list_first_entry_or_null(&queue->resp_send_list,
473 struct nvmet_tcp_cmd, entry);
474 if (unlikely(!queue->snd_cmd))
475 return NULL;
476 }
477
478 list_del_init(&queue->snd_cmd->entry);
479 queue->send_list_len--;
480
481 if (nvmet_tcp_need_data_out(queue->snd_cmd))
482 nvmet_setup_c2h_data_pdu(queue->snd_cmd);
483 else if (nvmet_tcp_need_data_in(queue->snd_cmd))
484 nvmet_setup_r2t_pdu(queue->snd_cmd);
485 else
486 nvmet_setup_response_pdu(queue->snd_cmd);
487
488 return queue->snd_cmd;
489}
490
491static void nvmet_tcp_queue_response(struct nvmet_req *req)
492{
493 struct nvmet_tcp_cmd *cmd =
494 container_of(req, struct nvmet_tcp_cmd, req);
495 struct nvmet_tcp_queue *queue = cmd->queue;
496
497 llist_add(&cmd->lentry, &queue->resp_list);
498 queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
499}
500
501static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
502{
503 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
504 int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
505 int ret;
506
507 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
508 offset_in_page(cmd->data_pdu) + cmd->offset,
509 left, MSG_DONTWAIT | MSG_MORE);
510 if (ret <= 0)
511 return ret;
512
513 cmd->offset += ret;
514 left -= ret;
515
516 if (left)
517 return -EAGAIN;
518
519 cmd->state = NVMET_TCP_SEND_DATA;
520 cmd->offset = 0;
521 return 1;
522}
523
524static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
525{
526 struct nvmet_tcp_queue *queue = cmd->queue;
527 int ret;
528
529 while (cmd->cur_sg) {
530 struct page *page = sg_page(cmd->cur_sg);
531 u32 left = cmd->cur_sg->length - cmd->offset;
532 int flags = MSG_DONTWAIT;
533
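 /*
 * Keep MSG_MORE set as long as more bytes will follow on the wire:
 * more commands queued behind this one, more of this payload left to
 * send, or a data digest / response PDU still to come.
 */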
534 if ((!last_in_batch && cmd->queue->send_list_len) ||
535 cmd->wbytes_done + left < cmd->req.transfer_len ||
536 queue->data_digest || !queue->nvme_sq.sqhd_disabled)
537 flags |= MSG_MORE;
538
539 ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
540 left, flags);
541 if (ret <= 0)
542 return ret;
543
544 cmd->offset += ret;
545 cmd->wbytes_done += ret;
546
547 /* Done with sg? */
548 if (cmd->offset == cmd->cur_sg->length) {
549 cmd->cur_sg = sg_next(cmd->cur_sg);
550 cmd->offset = 0;
551 }
552 }
553
554 if (queue->data_digest) {
555 cmd->state = NVMET_TCP_SEND_DDGST;
556 cmd->offset = 0;
557 } else {
558 if (queue->nvme_sq.sqhd_disabled) {
559 cmd->queue->snd_cmd = NULL;
560 nvmet_tcp_put_cmd(cmd);
561 } else {
562 nvmet_setup_response_pdu(cmd);
563 }
564 }
565
566 if (queue->nvme_sq.sqhd_disabled) {
567 kfree(cmd->iov);
568 sgl_free(cmd->req.sg);
569 }
570
571 return 1;
572
573}
574
575static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
576 bool last_in_batch)
577{
578 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
579 int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
580 int flags = MSG_DONTWAIT;
581 int ret;
582
583 if (!last_in_batch && cmd->queue->send_list_len)
584 flags |= MSG_MORE;
585 else
586 flags |= MSG_EOR;
587
588 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
589 offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
590 if (ret <= 0)
591 return ret;
592 cmd->offset += ret;
593 left -= ret;
594
595 if (left)
596 return -EAGAIN;
597
598 kfree(cmd->iov);
599 sgl_free(cmd->req.sg);
600 cmd->queue->snd_cmd = NULL;
601 nvmet_tcp_put_cmd(cmd);
602 return 1;
603}
604
605static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
606{
607 u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
608 int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
609 int flags = MSG_DONTWAIT;
610 int ret;
611
612 if (!last_in_batch && cmd->queue->send_list_len)
613 flags |= MSG_MORE;
614 else
615 flags |= MSG_EOR;
616
617 ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
618 offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
619 if (ret <= 0)
620 return ret;
621 cmd->offset += ret;
622 left -= ret;
623
624 if (left)
625 return -EAGAIN;
626
627 cmd->queue->snd_cmd = NULL;
628 return 1;
629}
630
631static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd)
632{
633 struct nvmet_tcp_queue *queue = cmd->queue;
634 struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
635 struct kvec iov = {
636 .iov_base = &cmd->exp_ddgst + cmd->offset,
637 .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
638 };
639 int ret;
640
641 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
642 if (unlikely(ret <= 0))
643 return ret;
644
645 cmd->offset += ret;
646
647 if (queue->nvme_sq.sqhd_disabled) {
648 cmd->queue->snd_cmd = NULL;
649 nvmet_tcp_put_cmd(cmd);
650 } else {
651 nvmet_setup_response_pdu(cmd);
652 }
653 return 1;
654}
655
656static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
657 bool last_in_batch)
658{
659 struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
660 int ret = 0;
661
662 if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
663 cmd = nvmet_tcp_fetch_cmd(queue);
664 if (unlikely(!cmd))
665 return 0;
666 }
667
668 if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
669 ret = nvmet_try_send_data_pdu(cmd);
670 if (ret <= 0)
671 goto done_send;
672 }
673
674 if (cmd->state == NVMET_TCP_SEND_DATA) {
675 ret = nvmet_try_send_data(cmd, last_in_batch);
676 if (ret <= 0)
677 goto done_send;
678 }
679
680 if (cmd->state == NVMET_TCP_SEND_DDGST) {
681 ret = nvmet_try_send_ddgst(cmd);
682 if (ret <= 0)
683 goto done_send;
684 }
685
686 if (cmd->state == NVMET_TCP_SEND_R2T) {
687 ret = nvmet_try_send_r2t(cmd, last_in_batch);
688 if (ret <= 0)
689 goto done_send;
690 }
691
692 if (cmd->state == NVMET_TCP_SEND_RESPONSE)
693 ret = nvmet_try_send_response(cmd, last_in_batch);
694
695done_send:
696 if (ret < 0) {
697 if (ret == -EAGAIN)
698 return 0;
699 return ret;
700 }
701
702 return 1;
703}
704
705static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
706 int budget, int *sends)
707{
708 int i, ret = 0;
709
710 for (i = 0; i < budget; i++) {
711 ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
712 if (ret <= 0)
713 break;
714 (*sends)++;
715 }
716
717 return ret;
718}
719
720static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
721{
722 queue->offset = 0;
723 queue->left = sizeof(struct nvme_tcp_hdr);
724 queue->cmd = NULL;
725 queue->rcv_state = NVMET_TCP_RECV_PDU;
726}
727
728static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
729{
730 struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
731
732 ahash_request_free(queue->rcv_hash);
733 ahash_request_free(queue->snd_hash);
734 crypto_free_ahash(tfm);
735}
736
737static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
738{
739 struct crypto_ahash *tfm;
740
741 tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
742 if (IS_ERR(tfm))
743 return PTR_ERR(tfm);
744
745 queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
746 if (!queue->snd_hash)
747 goto free_tfm;
748 ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
749
750 queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
751 if (!queue->rcv_hash)
752 goto free_snd_hash;
753 ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
754
755 return 0;
756free_snd_hash:
757 ahash_request_free(queue->snd_hash);
758free_tfm:
759 crypto_free_ahash(tfm);
760 return -ENOMEM;
761}
762
763
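/*
 * Connection initialization: validate the ICReq PDU, negotiate header and
 * data digests (allocating crc32c contexts if either is enabled) and answer
 * with an ICResp before moving the queue to the LIVE state.
 */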
764static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
765{
766 struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
767 struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
768 struct msghdr msg = {};
769 struct kvec iov;
770 int ret;
771
772 if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
773 pr_err("bad nvme-tcp pdu length (%d)\n",
774 le32_to_cpu(icreq->hdr.plen));
775 nvmet_tcp_fatal_error(queue);
 return -EPROTO;
776 }
777
778 if (icreq->pfv != NVME_TCP_PFV_1_0) {
779 pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
780 return -EPROTO;
781 }
782
783 if (icreq->hpda != 0) {
784 pr_err("queue %d: unsupported hpda %d\n", queue->idx,
785 icreq->hpda);
786 return -EPROTO;
787 }
788
789 queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
790 queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
791 if (queue->hdr_digest || queue->data_digest) {
792 ret = nvmet_tcp_alloc_crypto(queue);
793 if (ret)
794 return ret;
795 }
796
797 memset(icresp, 0, sizeof(*icresp));
798 icresp->hdr.type = nvme_tcp_icresp;
799 icresp->hdr.hlen = sizeof(*icresp);
800 icresp->hdr.pdo = 0;
801 icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
802 icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
803 icresp->maxdata = cpu_to_le32(0x400000); /* 16M arbitrary limit */
804 icresp->cpda = 0;
805 if (queue->hdr_digest)
806 icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
807 if (queue->data_digest)
808 icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
809
810 iov.iov_base = icresp;
811 iov.iov_len = sizeof(*icresp);
812 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
813 if (ret < 0)
814 goto free_crypto;
815
816 queue->state = NVMET_TCP_Q_LIVE;
817 nvmet_prepare_receive_pdu(queue);
818 return 0;
819free_crypto:
820 if (queue->hdr_digest || queue->data_digest)
821 nvmet_tcp_free_crypto(queue);
822 return ret;
823}
824
825static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
826 struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
827{
828 int ret;
829
830 /* recover the expected data transfer length */
831 req->data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
832
833 if (!nvme_is_write(cmd->req.cmd) ||
834 req->data_len > cmd->req.port->inline_data_size) {
835 nvmet_prepare_receive_pdu(queue);
836 return;
837 }
838
839 ret = nvmet_tcp_map_data(cmd);
840 if (unlikely(ret)) {
841 pr_err("queue %d: failed to map data\n", queue->idx);
842 nvmet_tcp_fatal_error(queue);
843 return;
844 }
845
846 queue->rcv_state = NVMET_TCP_RECV_DATA;
847 nvmet_tcp_map_pdu_iovec(cmd);
848 cmd->flags |= NVMET_TCP_F_INIT_FAILED;
849}
850
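/*
 * An H2C data PDU carries write data solicited by an earlier R2T; its ttag
 * is the command index we advertised in the R2T, and the data offset must
 * match what has already been received for that command.
 */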
851static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
852{
853 struct nvme_tcp_data_pdu *data = &queue->pdu.data;
854 struct nvmet_tcp_cmd *cmd;
855
856 if (likely(queue->nr_cmds))
857 cmd = &queue->cmds[data->ttag];
858 else
859 cmd = &queue->connect;
860
861 if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
862 pr_err("ttag %u unexpected data offset %u (expected %u)\n",
863 data->ttag, le32_to_cpu(data->data_offset),
864 cmd->rbytes_done);
865 /* FIXME: use path and transport errors */
866 nvmet_req_complete(&cmd->req,
867 NVME_SC_INVALID_FIELD | NVME_SC_DNR);
868 return -EPROTO;
869 }
870
871 cmd->pdu_len = le32_to_cpu(data->data_length);
872 cmd->pdu_recv = 0;
873 nvmet_tcp_map_pdu_iovec(cmd);
874 queue->cmd = cmd;
875 queue->rcv_state = NVMET_TCP_RECV_DATA;
876
877 return 0;
878}
879
880static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
881{
882 struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
883 struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
884 struct nvmet_req *req;
885 int ret;
886
887 if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
888 if (hdr->type != nvme_tcp_icreq) {
889 pr_err("unexpected pdu type (%d) before icreq\n",
890 hdr->type);
891 nvmet_tcp_fatal_error(queue);
892 return -EPROTO;
893 }
894 return nvmet_tcp_handle_icreq(queue);
895 }
896
897 if (hdr->type == nvme_tcp_h2c_data) {
898 ret = nvmet_tcp_handle_h2c_data_pdu(queue);
899 if (unlikely(ret))
900 return ret;
901 return 0;
902 }
903
904 queue->cmd = nvmet_tcp_get_cmd(queue);
905 if (unlikely(!queue->cmd)) {
906 /* This should never happen */
907 pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d",
908 queue->idx, queue->nr_cmds, queue->send_list_len,
909 nvme_cmd->common.opcode);
910 nvmet_tcp_fatal_error(queue);
911 return -ENOMEM;
912 }
913
914 req = &queue->cmd->req;
915 memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
916
917 if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
918 &queue->nvme_sq, &nvmet_tcp_ops))) {
919 pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
920 req->cmd, req->cmd->common.command_id,
921 req->cmd->common.opcode,
922 le32_to_cpu(req->cmd->common.dptr.sgl.length));
923
924 nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
925 return -EAGAIN;
926 }
927
928 ret = nvmet_tcp_map_data(queue->cmd);
929 if (unlikely(ret)) {
930 pr_err("queue %d: failed to map data\n", queue->idx);
931 if (nvmet_tcp_has_inline_data(queue->cmd))
932 nvmet_tcp_fatal_error(queue);
933 else
934 nvmet_req_complete(req, ret);
935 ret = -EAGAIN;
936 goto out;
937 }
938
939 if (nvmet_tcp_need_data_in(queue->cmd)) {
940 if (nvmet_tcp_has_inline_data(queue->cmd)) {
941 queue->rcv_state = NVMET_TCP_RECV_DATA;
942 nvmet_tcp_map_pdu_iovec(queue->cmd);
943 return 0;
944 }
945 /* send back R2T */
946 nvmet_tcp_queue_response(&queue->cmd->req);
947 goto out;
948 }
949
950 nvmet_req_execute(&queue->cmd->req);
951out:
952 nvmet_prepare_receive_pdu(queue);
953 return ret;
954}
955
956static const u8 nvme_tcp_pdu_sizes[] = {
957 [nvme_tcp_icreq] = sizeof(struct nvme_tcp_icreq_pdu),
958 [nvme_tcp_cmd] = sizeof(struct nvme_tcp_cmd_pdu),
959 [nvme_tcp_h2c_data] = sizeof(struct nvme_tcp_data_pdu),
960};
961
962static inline u8 nvmet_tcp_pdu_size(u8 type)
963{
964 size_t idx = type;
965
966 return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
967 nvme_tcp_pdu_sizes[idx]) ?
968 nvme_tcp_pdu_sizes[idx] : 0;
969}
970
971static inline bool nvmet_tcp_pdu_valid(u8 type)
972{
973 switch (type) {
974 case nvme_tcp_icreq:
975 case nvme_tcp_cmd:
976 case nvme_tcp_h2c_data:
977 /* fallthru */
978 return true;
979 }
980
981 return false;
982}
983
984static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
985{
986 struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
987 int len;
988 struct kvec iov;
989 struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
990
991recv:
992 iov.iov_base = (void *)&queue->pdu + queue->offset;
993 iov.iov_len = queue->left;
994 len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
995 iov.iov_len, msg.msg_flags);
996 if (unlikely(len < 0))
997 return len;
998
999 queue->offset += len;
1000 queue->left -= len;
1001 if (queue->left)
1002 return -EAGAIN;
1003
1004 if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
1005 u8 hdgst = nvmet_tcp_hdgst_len(queue);
1006
1007 if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
1008 pr_err("unexpected pdu type %d\n", hdr->type);
1009 nvmet_tcp_fatal_error(queue);
1010 return -EIO;
1011 }
1012
1013 if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
1014 pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
1015 return -EIO;
1016 }
1017
1018 queue->left = hdr->hlen - queue->offset + hdgst;
1019 goto recv;
1020 }
1021
1022 if (queue->hdr_digest &&
1023 nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) {
1024 nvmet_tcp_fatal_error(queue); /* fatal */
1025 return -EPROTO;
1026 }
1027
1028 if (queue->data_digest &&
1029 nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
1030 nvmet_tcp_fatal_error(queue); /* fatal */
1031 return -EPROTO;
1032 }
1033
1034 return nvmet_tcp_done_recv_pdu(queue);
1035}
1036
1037static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
1038{
1039 struct nvmet_tcp_queue *queue = cmd->queue;
1040
1041 nvmet_tcp_ddgst(queue->rcv_hash, cmd);
1042 queue->offset = 0;
1043 queue->left = NVME_TCP_DIGEST_LENGTH;
1044 queue->rcv_state = NVMET_TCP_RECV_DDGST;
1045}
1046
1047static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
1048{
1049 struct nvmet_tcp_cmd *cmd = queue->cmd;
1050 int ret;
1051
1052 while (msg_data_left(&cmd->recv_msg)) {
1053 ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
1054 cmd->recv_msg.msg_flags);
1055 if (ret <= 0)
1056 return ret;
1057
1058 cmd->pdu_recv += ret;
1059 cmd->rbytes_done += ret;
1060 }
1061
1062 nvmet_tcp_unmap_pdu_iovec(cmd);
1063
1064 if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1065 cmd->rbytes_done == cmd->req.transfer_len) {
1066 if (queue->data_digest) {
1067 nvmet_tcp_prep_recv_ddgst(cmd);
1068 return 0;
1069 }
1070 nvmet_req_execute(&cmd->req);
1071 }
1072
1073 nvmet_prepare_receive_pdu(queue);
1074 return 0;
1075}
1076
1077static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
1078{
1079 struct nvmet_tcp_cmd *cmd = queue->cmd;
1080 int ret;
1081 struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1082 struct kvec iov = {
1083 .iov_base = (void *)&cmd->recv_ddgst + queue->offset,
1084 .iov_len = queue->left
1085 };
1086
1087 ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1088 iov.iov_len, msg.msg_flags);
1089 if (unlikely(ret < 0))
1090 return ret;
1091
1092 queue->offset += ret;
1093 queue->left -= ret;
1094 if (queue->left)
1095 return -EAGAIN;
1096
1097 if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
1098 pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
1099 queue->idx, cmd->req.cmd->common.command_id,
1100 queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
1101 le32_to_cpu(cmd->exp_ddgst));
1102 nvmet_tcp_finish_cmd(cmd);
1103 nvmet_tcp_fatal_error(queue);
1104 ret = -EPROTO;
1105 goto out;
1106 }
1107
1108 if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1109 cmd->rbytes_done == cmd->req.transfer_len)
1110 nvmet_req_execute(&cmd->req);
1111 ret = 0;
1112out:
1113 nvmet_prepare_receive_pdu(queue);
1114 return ret;
1115}
1116
1117static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
1118{
1119 int result = 0;
1120
1121 if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
1122 return 0;
1123
1124 if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
1125 result = nvmet_tcp_try_recv_pdu(queue);
1126 if (result != 0)
1127 goto done_recv;
1128 }
1129
1130 if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
1131 result = nvmet_tcp_try_recv_data(queue);
1132 if (result != 0)
1133 goto done_recv;
1134 }
1135
1136 if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
1137 result = nvmet_tcp_try_recv_ddgst(queue);
1138 if (result != 0)
1139 goto done_recv;
1140 }
1141
1142done_recv:
1143 if (result < 0) {
1144 if (result == -EAGAIN)
1145 return 0;
1146 return result;
1147 }
1148 return 1;
1149}
1150
1151static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
1152 int budget, int *recvs)
1153{
1154 int i, ret = 0;
1155
1156 for (i = 0; i < budget; i++) {
1157 ret = nvmet_tcp_try_recv_one(queue);
1158 if (ret <= 0)
1159 break;
1160 (*recvs)++;
1161 }
1162
1163 return ret;
1164}
1165
1166static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
1167{
1168 spin_lock(&queue->state_lock);
1169 if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
1170 queue->state = NVMET_TCP_Q_DISCONNECTING;
1171 schedule_work(&queue->release_work);
1172 }
1173 spin_unlock(&queue->state_lock);
1174}
1175
1176static void nvmet_tcp_io_work(struct work_struct *w)
1177{
1178 struct nvmet_tcp_queue *queue =
1179 container_of(w, struct nvmet_tcp_queue, io_work);
1180 bool pending;
1181 int ret, ops = 0;
1182
1183 do {
1184 pending = false;
1185
1186 ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
1187 if (ret > 0) {
1188 pending = true;
1189 } else if (ret < 0) {
1190 if (ret == -EPIPE || ret == -ECONNRESET)
1191 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1192 else
1193 nvmet_tcp_fatal_error(queue);
1194 return;
1195 }
1196
1197 ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
1198 if (ret > 0) {
1199 /* transmitted message/data */
1200 pending = true;
1201 } else if (ret < 0) {
1202 if (ret == -EPIPE || ret == -ECONNRESET)
1203 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1204 else
1205 nvmet_tcp_fatal_error(queue);
1206 return;
1207 }
1208
1209 } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
1210
1211 /*
1212 * We exhausted our budget, requeue ourselves
1213 */
1214 if (pending)
1215 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1216}
1217
1218static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
1219 struct nvmet_tcp_cmd *c)
1220{
1221 u8 hdgst = nvmet_tcp_hdgst_len(queue);
1222
1223 c->queue = queue;
1224 c->req.port = queue->port->nport;
1225
1226 c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
1227 sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1228 if (!c->cmd_pdu)
1229 return -ENOMEM;
1230 c->req.cmd = &c->cmd_pdu->cmd;
1231
1232 c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
1233 sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1234 if (!c->rsp_pdu)
1235 goto out_free_cmd;
1236 c->req.cqe = &c->rsp_pdu->cqe;
1237
1238 c->data_pdu = page_frag_alloc(&queue->pf_cache,
1239 sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1240 if (!c->data_pdu)
1241 goto out_free_rsp;
1242
1243 c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
1244 sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1245 if (!c->r2t_pdu)
1246 goto out_free_data;
1247
1248 c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1249
1250 list_add_tail(&c->entry, &queue->free_list);
1251
1252 return 0;
1253out_free_data:
1254 page_frag_free(c->data_pdu);
1255out_free_rsp:
1256 page_frag_free(c->rsp_pdu);
1257out_free_cmd:
1258 page_frag_free(c->cmd_pdu);
1259 return -ENOMEM;
1260}
1261
1262static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
1263{
1264 page_frag_free(c->r2t_pdu);
1265 page_frag_free(c->data_pdu);
1266 page_frag_free(c->rsp_pdu);
1267 page_frag_free(c->cmd_pdu);
1268}
1269
1270static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
1271{
1272 struct nvmet_tcp_cmd *cmds;
1273 int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;
1274
1275 cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
1276 if (!cmds)
1277 goto out;
1278
1279 for (i = 0; i < nr_cmds; i++) {
1280 ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
1281 if (ret)
1282 goto out_free;
1283 }
1284
1285 queue->cmds = cmds;
1286
1287 return 0;
1288out_free:
1289 while (--i >= 0)
1290 nvmet_tcp_free_cmd(cmds + i);
1291 kfree(cmds);
1292out:
1293 return ret;
1294}
1295
1296static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
1297{
1298 struct nvmet_tcp_cmd *cmds = queue->cmds;
1299 int i;
1300
1301 for (i = 0; i < queue->nr_cmds; i++)
1302 nvmet_tcp_free_cmd(cmds + i);
1303
1304 nvmet_tcp_free_cmd(&queue->connect);
1305 kfree(cmds);
1306}
1307
1308static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
1309{
1310 struct socket *sock = queue->sock;
1311
1312 write_lock_bh(&sock->sk->sk_callback_lock);
1313 sock->sk->sk_data_ready = queue->data_ready;
1314 sock->sk->sk_state_change = queue->state_change;
1315 sock->sk->sk_write_space = queue->write_space;
1316 sock->sk->sk_user_data = NULL;
1317 write_unlock_bh(&sock->sk->sk_callback_lock);
1318}
1319
1320static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
1321{
1322 nvmet_req_uninit(&cmd->req);
1323 nvmet_tcp_unmap_pdu_iovec(cmd);
1324 kfree(cmd->iov);
1325 sgl_free(cmd->req.sg);
1326}
1327
1328static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
1329{
1330 struct nvmet_tcp_cmd *cmd = queue->cmds;
1331 int i;
1332
1333 for (i = 0; i < queue->nr_cmds; i++, cmd++) {
1334 if (nvmet_tcp_need_data_in(cmd))
1335 nvmet_tcp_finish_cmd(cmd);
1336 }
1337
1338 if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
1339 /* failed in connect */
1340 nvmet_tcp_finish_cmd(&queue->connect);
1341 }
1342}
1343
1344static void nvmet_tcp_release_queue_work(struct work_struct *w)
1345{
1346 struct nvmet_tcp_queue *queue =
1347 container_of(w, struct nvmet_tcp_queue, release_work);
1348
1349 mutex_lock(&nvmet_tcp_queue_mutex);
1350 list_del_init(&queue->queue_list);
1351 mutex_unlock(&nvmet_tcp_queue_mutex);
1352
1353 nvmet_tcp_restore_socket_callbacks(queue);
1354 flush_work(&queue->io_work);
1355
1356 nvmet_tcp_uninit_data_in_cmds(queue);
1357 nvmet_sq_destroy(&queue->nvme_sq);
1358 cancel_work_sync(&queue->io_work);
1359 sock_release(queue->sock);
1360 nvmet_tcp_free_cmds(queue);
1361 if (queue->hdr_digest || queue->data_digest)
1362 nvmet_tcp_free_crypto(queue);
1363 ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1364
1365 kfree(queue);
1366}
1367
1368static void nvmet_tcp_data_ready(struct sock *sk)
1369{
1370 struct nvmet_tcp_queue *queue;
1371
1372 read_lock_bh(&sk->sk_callback_lock);
1373 queue = sk->sk_user_data;
1374 if (likely(queue))
1375 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1376 read_unlock_bh(&sk->sk_callback_lock);
1377}
1378
1379static void nvmet_tcp_write_space(struct sock *sk)
1380{
1381 struct nvmet_tcp_queue *queue;
1382
1383 read_lock_bh(&sk->sk_callback_lock);
1384 queue = sk->sk_user_data;
1385 if (unlikely(!queue))
1386 goto out;
1387
1388 if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
1389 queue->write_space(sk);
1390 goto out;
1391 }
1392
1393 if (sk_stream_is_writeable(sk)) {
1394 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1395 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1396 }
1397out:
1398 read_unlock_bh(&sk->sk_callback_lock);
1399}
1400
1401static void nvmet_tcp_state_change(struct sock *sk)
1402{
1403 struct nvmet_tcp_queue *queue;
1404
1405 read_lock_bh(&sk->sk_callback_lock);
1406 queue = sk->sk_user_data;
1407 if (!queue)
1408 goto done;
1409
1410 switch (sk->sk_state) {
1411 case TCP_FIN_WAIT1:
1412 case TCP_CLOSE_WAIT:
1413 case TCP_CLOSE:
1414 /* FALLTHRU */
1415 nvmet_tcp_schedule_release_queue(queue);
1416 break;
1417 default:
1418 pr_warn("queue %d unhandled state %d\n",
1419 queue->idx, sk->sk_state);
1420 }
1421done:
1422 read_unlock_bh(&sk->sk_callback_lock);
1423}
1424
1425static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
1426{
1427 struct socket *sock = queue->sock;
1428 struct inet_sock *inet = inet_sk(sock->sk);
1429 struct linger sol = { .l_onoff = 1, .l_linger = 0 };
1430 int ret;
1431
1432 ret = kernel_getsockname(sock,
1433 (struct sockaddr *)&queue->sockaddr);
1434 if (ret < 0)
1435 return ret;
1436
1437 ret = kernel_getpeername(sock,
1438 (struct sockaddr *)&queue->sockaddr_peer);
1439 if (ret < 0)
1440 return ret;
1441
1442 /*
1443 * Cleanup whatever is sitting in the TCP transmit queue on socket
1444 * close. This is done to prevent stale data from being sent should
1445 * the network connection be restored before TCP times out.
1446 */
1447 ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
1448 (char *)&sol, sizeof(sol));
1449 if (ret)
1450 return ret;
1451
1452 /* Set socket type of service */
1453 if (inet->rcv_tos > 0) {
1454 int tos = inet->rcv_tos;
1455
1456 ret = kernel_setsockopt(sock, SOL_IP, IP_TOS,
1457 (char *)&tos, sizeof(tos));
1458 if (ret)
1459 return ret;
1460 }
1461
1462 write_lock_bh(&sock->sk->sk_callback_lock);
1463 sock->sk->sk_user_data = queue;
1464 queue->data_ready = sock->sk->sk_data_ready;
1465 sock->sk->sk_data_ready = nvmet_tcp_data_ready;
1466 queue->state_change = sock->sk->sk_state_change;
1467 sock->sk->sk_state_change = nvmet_tcp_state_change;
1468 queue->write_space = sock->sk->sk_write_space;
1469 sock->sk->sk_write_space = nvmet_tcp_write_space;
1470 write_unlock_bh(&sock->sk->sk_callback_lock);
1471
1472 return 0;
1473}
1474
1475static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
1476 struct socket *newsock)
1477{
1478 struct nvmet_tcp_queue *queue;
1479 int ret;
1480
1481 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1482 if (!queue)
1483 return -ENOMEM;
1484
1485 INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
1486 INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
1487 queue->sock = newsock;
1488 queue->port = port;
1489 queue->nr_cmds = 0;
1490 spin_lock_init(&queue->state_lock);
1491 queue->state = NVMET_TCP_Q_CONNECTING;
1492 INIT_LIST_HEAD(&queue->free_list);
1493 init_llist_head(&queue->resp_list);
1494 INIT_LIST_HEAD(&queue->resp_send_list);
1495
1496 queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
1497 if (queue->idx < 0) {
1498 ret = queue->idx;
1499 goto out_free_queue;
1500 }
1501
1502 ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
1503 if (ret)
1504 goto out_ida_remove;
1505
1506 ret = nvmet_sq_init(&queue->nvme_sq);
1507 if (ret)
1508 goto out_free_connect;
1509
1510 port->last_cpu = cpumask_next_wrap(port->last_cpu,
1511 cpu_online_mask, -1, false);
1512 queue->cpu = port->last_cpu;
1513 nvmet_prepare_receive_pdu(queue);
1514
1515 mutex_lock(&nvmet_tcp_queue_mutex);
1516 list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
1517 mutex_unlock(&nvmet_tcp_queue_mutex);
1518
1519 ret = nvmet_tcp_set_queue_sock(queue);
1520 if (ret)
1521 goto out_destroy_sq;
1522
1523 queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1524
1525 return 0;
1526out_destroy_sq:
1527 mutex_lock(&nvmet_tcp_queue_mutex);
1528 list_del_init(&queue->queue_list);
1529 mutex_unlock(&nvmet_tcp_queue_mutex);
1530 nvmet_sq_destroy(&queue->nvme_sq);
1531out_free_connect:
1532 nvmet_tcp_free_cmd(&queue->connect);
1533out_ida_remove:
1534 ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1535out_free_queue:
1536 kfree(queue);
1537 return ret;
1538}
1539
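/*
 * Accept loop for the listening port: drain all pending connections
 * (kernel_accept() with O_NONBLOCK returns -EAGAIN once none are left) and
 * set up an nvmet_tcp_queue for each accepted socket.
 */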
1540static void nvmet_tcp_accept_work(struct work_struct *w)
1541{
1542 struct nvmet_tcp_port *port =
1543 container_of(w, struct nvmet_tcp_port, accept_work);
1544 struct socket *newsock;
1545 int ret;
1546
1547 while (true) {
1548 ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
1549 if (ret < 0) {
1550 if (ret != -EAGAIN)
1551 pr_warn("failed to accept err=%d\n", ret);
1552 return;
1553 }
1554 ret = nvmet_tcp_alloc_queue(port, newsock);
1555 if (ret) {
1556 pr_err("failed to allocate queue\n");
1557 sock_release(newsock);
1558 }
1559 }
1560}
1561
1562static void nvmet_tcp_listen_data_ready(struct sock *sk)
1563{
1564 struct nvmet_tcp_port *port;
1565
1566 read_lock_bh(&sk->sk_callback_lock);
1567 port = sk->sk_user_data;
1568 if (!port)
1569 goto out;
1570
1571 if (sk->sk_state == TCP_LISTEN)
1572 schedule_work(&port->accept_work);
1573out:
1574 read_unlock_bh(&sk->sk_callback_lock);
1575}
1576
1577static int nvmet_tcp_add_port(struct nvmet_port *nport)
1578{
1579 struct nvmet_tcp_port *port;
1580 __kernel_sa_family_t af;
1581 int opt, ret;
1582
1583 port = kzalloc(sizeof(*port), GFP_KERNEL);
1584 if (!port)
1585 return -ENOMEM;
1586
1587 switch (nport->disc_addr.adrfam) {
1588 case NVMF_ADDR_FAMILY_IP4:
1589 af = AF_INET;
1590 break;
1591 case NVMF_ADDR_FAMILY_IP6:
1592 af = AF_INET6;
1593 break;
1594 default:
1595 pr_err("address family %d not supported\n",
1596 nport->disc_addr.adrfam);
1597 ret = -EINVAL;
1598 goto err_port;
1599 }
1600
1601 ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
1602 nport->disc_addr.trsvcid, &port->addr);
1603 if (ret) {
1604 pr_err("malformed ip/port passed: %s:%s\n",
1605 nport->disc_addr.traddr, nport->disc_addr.trsvcid);
1606 goto err_port;
1607 }
1608
1609 port->nport = nport;
1610 port->last_cpu = -1;
1611 INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
1612 if (port->nport->inline_data_size < 0)
1613 port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
1614
1615 ret = sock_create(port->addr.ss_family, SOCK_STREAM,
1616 IPPROTO_TCP, &port->sock);
1617 if (ret) {
1618 pr_err("failed to create a socket\n");
1619 goto err_port;
1620 }
1621
1622 port->sock->sk->sk_user_data = port;
1623 port->data_ready = port->sock->sk->sk_data_ready;
1624 port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
1625
1626 opt = 1;
1627 ret = kernel_setsockopt(port->sock, IPPROTO_TCP,
1628 TCP_NODELAY, (char *)&opt, sizeof(opt));
1629 if (ret) {
1630 pr_err("failed to set TCP_NODELAY sock opt %d\n", ret);
1631 goto err_sock;
1632 }
1633
1634 ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR,
1635 (char *)&opt, sizeof(opt));
1636 if (ret) {
1637 pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret);
1638 goto err_sock;
1639 }
1640
1641 ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
1642 sizeof(port->addr));
1643 if (ret) {
1644 pr_err("failed to bind port socket %d\n", ret);
1645 goto err_sock;
1646 }
1647
1648 ret = kernel_listen(port->sock, 128);
1649 if (ret) {
1650 pr_err("failed to listen %d on port sock\n", ret);
1651 goto err_sock;
1652 }
1653
1654 nport->priv = port;
1655 pr_info("enabling port %d (%pISpc)\n",
1656 le16_to_cpu(nport->disc_addr.portid), &port->addr);
1657
1658 return 0;
1659
1660err_sock:
1661 sock_release(port->sock);
1662err_port:
1663 kfree(port);
1664 return ret;
1665}
1666
1667static void nvmet_tcp_remove_port(struct nvmet_port *nport)
1668{
1669 struct nvmet_tcp_port *port = nport->priv;
1670
1671 write_lock_bh(&port->sock->sk->sk_callback_lock);
1672 port->sock->sk->sk_data_ready = port->data_ready;
1673 port->sock->sk->sk_user_data = NULL;
1674 write_unlock_bh(&port->sock->sk->sk_callback_lock);
1675 cancel_work_sync(&port->accept_work);
1676
1677 sock_release(port->sock);
1678 kfree(port);
1679}
1680
1681static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
1682{
1683 struct nvmet_tcp_queue *queue;
1684
1685 mutex_lock(&nvmet_tcp_queue_mutex);
1686 list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1687 if (queue->nvme_sq.ctrl == ctrl)
1688 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1689 mutex_unlock(&nvmet_tcp_queue_mutex);
1690}
1691
1692static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
1693{
1694 struct nvmet_tcp_queue *queue =
1695 container_of(sq, struct nvmet_tcp_queue, nvme_sq);
1696
1697 if (sq->qid == 0) {
1698 /* Let inflight controller teardown complete */
1699 flush_scheduled_work();
1700 }
1701
1702 queue->nr_cmds = sq->size * 2;
1703 if (nvmet_tcp_alloc_cmds(queue))
1704 return NVME_SC_INTERNAL;
1705 return 0;
1706}
1707
1708static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
1709 struct nvmet_port *nport, char *traddr)
1710{
1711 struct nvmet_tcp_port *port = nport->priv;
1712
1713 if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
1714 struct nvmet_tcp_cmd *cmd =
1715 container_of(req, struct nvmet_tcp_cmd, req);
1716 struct nvmet_tcp_queue *queue = cmd->queue;
1717
1718 sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
1719 } else {
1720 memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
1721 }
1722}
1723
1724static struct nvmet_fabrics_ops nvmet_tcp_ops = {
1725 .owner = THIS_MODULE,
1726 .type = NVMF_TRTYPE_TCP,
1727 .msdbd = 1,
1728 .has_keyed_sgls = 0,
1729 .add_port = nvmet_tcp_add_port,
1730 .remove_port = nvmet_tcp_remove_port,
1731 .queue_response = nvmet_tcp_queue_response,
1732 .delete_ctrl = nvmet_tcp_delete_ctrl,
1733 .install_queue = nvmet_tcp_install_queue,
1734 .disc_traddr = nvmet_tcp_disc_port_addr,
1735};
1736
1737static int __init nvmet_tcp_init(void)
1738{
1739 int ret;
1740
1741 nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
1742 if (!nvmet_tcp_wq)
1743 return -ENOMEM;
1744
1745 ret = nvmet_register_transport(&nvmet_tcp_ops);
1746 if (ret)
1747 goto err;
1748
1749 return 0;
1750err:
1751 destroy_workqueue(nvmet_tcp_wq);
1752 return ret;
1753}
1754
1755static void __exit nvmet_tcp_exit(void)
1756{
1757 struct nvmet_tcp_queue *queue;
1758
1759 nvmet_unregister_transport(&nvmet_tcp_ops);
1760
1761 flush_scheduled_work();
1762 mutex_lock(&nvmet_tcp_queue_mutex);
1763 list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1764 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1765 mutex_unlock(&nvmet_tcp_queue_mutex);
1766 flush_scheduled_work();
1767
1768 destroy_workqueue(nvmet_tcp_wq);
1769}
1770
1771module_init(nvmet_tcp_init);
1772module_exit(nvmet_tcp_exit);
1773
1774MODULE_LICENSE("GPL v2");
1775MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */