Update Linux to v5.4.148
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.4.148.tar.gz
Change-Id: Ib3d26c5ba9b022e2e03533005c4fed4d7c30b61b
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 2b36f05..cf0ae71 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -62,8 +62,10 @@
config NVME_TCP
tristate "NVM Express over Fabrics TCP host driver"
depends on INET
- depends on BLK_DEV_NVME
+ depends on BLOCK
+ select NVME_CORE
select NVME_FABRICS
+ select CRYPTO
select CRYPTO_CRC32C
help
This provides support for the NVMe over Fabrics protocol using
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fa7ba09..a5b5a23 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -6,6 +6,7 @@
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
+#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
@@ -66,8 +67,8 @@
* nvme_reset_wq - hosts nvme reset works
* nvme_delete_wq - hosts nvme delete works
*
- * nvme_wq will host works such are scan, aen handling, fw activation,
- * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
+ * nvme_wq will host works such as scan, aen handling, fw activation,
+ * keep-alive, periodic reconnects etc. nvme_reset_wq
* runs reset works which also flush works hosted on nvme_wq for
* serialization purposes. nvme_delete_wq host controller deletion
* works which flush reset works for serialization.
@@ -287,11 +288,8 @@
nvme_req(req)->ctrl->comp_seen = true;
if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
- if ((req->cmd_flags & REQ_NVME_MPATH) &&
- blk_path_error(status)) {
- nvme_failover_req(req);
+ if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
return;
- }
if (!blk_queue_dying(req->q)) {
nvme_retry_req(req);
@@ -313,12 +311,33 @@
if (blk_mq_request_completed(req))
return true;
- nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR;
+ nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
+ nvme_req(req)->flags |= NVME_REQ_CANCELLED;
blk_mq_complete_request(req);
return true;
}
EXPORT_SYMBOL_GPL(nvme_cancel_request);
+void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
+{
+ if (ctrl->tagset) {
+ blk_mq_tagset_busy_iter(ctrl->tagset,
+ nvme_cancel_request, ctrl);
+ blk_mq_tagset_wait_completed_request(ctrl->tagset);
+ }
+}
+EXPORT_SYMBOL_GPL(nvme_cancel_tagset);
+
+void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
+{
+ if (ctrl->admin_tagset) {
+ blk_mq_tagset_busy_iter(ctrl->admin_tagset,
+ nvme_cancel_request, ctrl);
+ blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
+ }
+}
+EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
+
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state)
{
@@ -437,7 +456,6 @@
nvme_mpath_remove_disk(head);
ida_simple_remove(&head->subsys->ns_ida, head->instance);
- list_del_init(&head->entry);
cleanup_srcu_struct(&head->srcu);
nvme_put_subsystem(head->subsys);
kfree(head);
@@ -611,8 +629,14 @@
struct nvme_dsm_range *range;
struct bio *bio;
- range = kmalloc_array(segments, sizeof(*range),
- GFP_ATOMIC | __GFP_NOWARN);
+ /*
+ * Some devices do not consider the DSM 'Number of Ranges' field when
+ * determining how much data to DMA. Always allocate memory for maximum
+ * number of segments to prevent device reading beyond end of buffer.
+ */
+ static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
+
+ range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
if (!range) {
/*
* If we fail allocation our range, fallback to the controller
@@ -626,7 +650,7 @@
}
__rq_for_each_bio(bio, req) {
- u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
+ u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
if (n < segments) {
@@ -652,7 +676,7 @@
req->special_vec.bv_page = virt_to_page(range);
req->special_vec.bv_offset = offset_in_page(range);
- req->special_vec.bv_len = sizeof(*range) * segments;
+ req->special_vec.bv_len = alloc_size;
req->rq_flags |= RQF_SPECIAL_PAYLOAD;
return BLK_STS_OK;
@@ -667,10 +691,13 @@
cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->write_zeroes.slba =
- cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+ cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
cmnd->write_zeroes.length =
cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
- cmnd->write_zeroes.control = 0;
+ if (nvme_ns_has_pi(ns))
+ cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT);
+ else
+ cmnd->write_zeroes.control = 0;
return BLK_STS_OK;
}
@@ -691,7 +718,7 @@
cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
- cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+ cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
if (req_op(req) == REQ_OP_WRITE && ctrl->nr_streams)
@@ -966,7 +993,7 @@
startka = true;
spin_unlock_irqrestore(&ctrl->lock, flags);
if (startka)
- schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+ queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
}
static int nvme_keep_alive(struct nvme_ctrl *ctrl)
@@ -996,7 +1023,7 @@
dev_dbg(ctrl->device,
"reschedule traffic based keep-alive timer\n");
ctrl->comp_seen = false;
- schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+ queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
return;
}
@@ -1013,7 +1040,7 @@
if (unlikely(ctrl->kato == 0))
return;
- schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+ queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
}
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
@@ -1025,6 +1052,19 @@
}
EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
+/*
+ * In NVMe 1.0 the CNS field was just a binary controller or namespace
+ * flag, thus sending any new CNS opcodes has a big chance of not working.
+ * Qemu unfortunately had that bug after reporting a 1.1 version compliance
+ * (but not for any later version).
+ */
+static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
+{
+ if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
+ return ctrl->vs < NVME_VS(1, 2, 0);
+ return ctrl->vs < NVME_VS(1, 1, 0);
+}
+
static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{
struct nvme_command c = { };
@@ -1054,6 +1094,9 @@
int pos;
int len;
+ if (ctrl->quirks & NVME_QUIRK_NO_NS_DESC_LIST)
+ return 0;
+
c.identify.opcode = nvme_admin_identify;
c.identify.nsid = cpu_to_le32(nsid);
c.identify.cns = NVME_ID_CNS_NS_DESC_LIST;
@@ -1064,8 +1107,11 @@
status = nvme_submit_sync_cmd(ctrl->admin_q, &c, data,
NVME_IDENTIFY_DATA_SIZE);
- if (status)
+ if (status) {
+ dev_warn(ctrl->device,
+ "Identify Descriptors failed (%d)\n", status);
goto free_data;
+ }
for (pos = 0; pos < NVME_IDENTIFY_DATA_SIZE; pos += len) {
struct nvme_ns_id_desc *cur = data + pos;
@@ -1155,8 +1201,8 @@
static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid,
unsigned int dword11, void *buffer, size_t buflen, u32 *result)
{
+ union nvme_result res = { 0 };
struct nvme_command c;
- union nvme_result res;
int ret;
memset(&c, 0, sizeof(c));
@@ -1238,6 +1284,18 @@
queue_work(nvme_wq, &ctrl->async_event_work);
}
+/*
+ * Convert integer values from ioctl structures to user pointers, silently
+ * ignoring the upper bits in the compat case to match behaviour of 32-bit
+ * kernels.
+ */
+static void __user *nvme_to_user_ptr(uintptr_t ptrval)
+{
+ if (in_compat_syscall())
+ ptrval = (compat_uptr_t)ptrval;
+ return (void __user *)ptrval;
+}
+
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
struct nvme_user_io io;
@@ -1260,8 +1318,21 @@
}
length = (io.nblocks + 1) << ns->lba_shift;
- meta_len = (io.nblocks + 1) * ns->ms;
- metadata = (void __user *)(uintptr_t)io.metadata;
+
+ if ((io.control & NVME_RW_PRINFO_PRACT) &&
+ ns->ms == sizeof(struct t10_pi_tuple)) {
+ /*
+ * Protection information is stripped/inserted by the
+ * controller.
+ */
+ if (nvme_to_user_ptr(io.metadata))
+ return -EINVAL;
+ meta_len = 0;
+ metadata = NULL;
+ } else {
+ meta_len = (io.nblocks + 1) * ns->ms;
+ metadata = nvme_to_user_ptr(io.metadata);
+ }
if (ns->ext) {
length += meta_len;
@@ -1284,7 +1355,7 @@
c.rw.appmask = cpu_to_le16(io.appmask);
return nvme_submit_user_cmd(ns->queue, &c,
- (void __user *)(uintptr_t)io.addr, length,
+ nvme_to_user_ptr(io.addr), length,
metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
}
@@ -1404,9 +1475,9 @@
effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
- (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
- (void __user *)(uintptr_t)cmd.metadata,
- cmd.metadata_len, 0, &result, timeout);
+ nvme_to_user_ptr(cmd.addr), cmd.data_len,
+ nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
+ 0, &result, timeout);
nvme_passthru_end(ctrl, effects);
if (status >= 0) {
@@ -1451,8 +1522,8 @@
effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
- (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
- (void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len,
+ nvme_to_user_ptr(cmd.addr), cmd.data_len,
+ nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
0, &cmd.result, timeout);
nvme_passthru_end(ctrl, effects);
@@ -1645,12 +1716,6 @@
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
-static void nvme_set_chunk_size(struct nvme_ns *ns)
-{
- u32 chunk_size = (((u32)ns->noiob) << (ns->lba_shift - 9));
- blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
-}
-
static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
{
struct nvme_ctrl *ctrl = ns->ctrl;
@@ -1682,53 +1747,32 @@
blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
}
-static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns)
+/*
+ * Even though NVMe spec explicitly states that MDTS is not applicable to the
+ * write-zeroes, we are cautious and limit the size to the controllers
+ * max_hw_sectors value, which is based on the MDTS field and possibly other
+ * limiting factors.
+ */
+static void nvme_config_write_zeroes(struct request_queue *q,
+ struct nvme_ctrl *ctrl)
{
- u32 max_sectors;
- unsigned short bs = 1 << ns->lba_shift;
-
- if (!(ns->ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) ||
- (ns->ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
- return;
- /*
- * Even though NVMe spec explicitly states that MDTS is not
- * applicable to the write-zeroes:- "The restriction does not apply to
- * commands that do not transfer data between the host and the
- * controller (e.g., Write Uncorrectable ro Write Zeroes command).".
- * In order to be more cautious use controller's max_hw_sectors value
- * to configure the maximum sectors for the write-zeroes which is
- * configured based on the controller's MDTS field in the
- * nvme_init_identify() if available.
- */
- if (ns->ctrl->max_hw_sectors == UINT_MAX)
- max_sectors = ((u32)(USHRT_MAX + 1) * bs) >> 9;
- else
- max_sectors = ((u32)(ns->ctrl->max_hw_sectors + 1) * bs) >> 9;
-
- blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors);
+ if ((ctrl->oncs & NVME_CTRL_ONCS_WRITE_ZEROES) &&
+ !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
+ blk_queue_max_write_zeroes_sectors(q, ctrl->max_hw_sectors);
}
static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
struct nvme_id_ns *id, struct nvme_ns_ids *ids)
{
- int ret = 0;
-
memset(ids, 0, sizeof(*ids));
if (ctrl->vs >= NVME_VS(1, 1, 0))
memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
if (ctrl->vs >= NVME_VS(1, 2, 0))
memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
- if (ctrl->vs >= NVME_VS(1, 3, 0)) {
- /* Don't treat error as fatal we potentially
- * already have a NGUID or EUI-64
- */
- ret = nvme_identify_ns_descs(ctrl, nsid, ids);
- if (ret)
- dev_warn(ctrl->device,
- "Identify Descriptors failed (%d)\n", ret);
- }
- return ret;
+ if (ctrl->vs >= NVME_VS(1, 3, 0))
+ return nvme_identify_ns_descs(ctrl, nsid, ids);
+ return 0;
}
static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
@@ -1748,7 +1792,7 @@
static void nvme_update_disk_info(struct gendisk *disk,
struct nvme_ns *ns, struct nvme_id_ns *id)
{
- sector_t capacity = le64_to_cpu(id->nsze) << (ns->lba_shift - 9);
+ sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
unsigned short bs = 1 << ns->lba_shift;
u32 atomic_bs, phys_bs, io_opt;
@@ -1801,7 +1845,7 @@
set_capacity(disk, capacity);
nvme_config_discard(disk, ns);
- nvme_config_write_zeroes(disk, ns);
+ nvme_config_write_zeroes(disk->queue, ns->ctrl);
if (id->nsattr & (1 << 0))
set_disk_ro(disk, true);
@@ -1814,6 +1858,7 @@
static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
struct nvme_ns *ns = disk->private_data;
+ u32 iob;
/*
* If identify namespace failed, use default 512 byte block size so
@@ -1822,7 +1867,13 @@
ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
if (ns->lba_shift == 0)
ns->lba_shift = 9;
- ns->noiob = le16_to_cpu(id->noiob);
+
+ if ((ns->ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
+ is_power_of_2(ns->ctrl->max_hw_sectors))
+ iob = ns->ctrl->max_hw_sectors;
+ else
+ iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
+
ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
/* the PI implementation requires metadata equal t10 pi tuple size */
@@ -1831,14 +1882,14 @@
else
ns->pi_type = 0;
- if (ns->noiob)
- nvme_set_chunk_size(ns);
+ if (iob)
+ blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob));
nvme_update_disk_info(disk, ns, id);
#ifdef CONFIG_NVME_MULTIPATH
if (ns->head->disk) {
nvme_update_disk_info(ns->head->disk, ns, id);
blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
- revalidate_disk(ns->head->disk);
+ nvme_mpath_update_disk_size(ns->head->disk);
}
#endif
}
@@ -2183,9 +2234,6 @@
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
}
- if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
- is_power_of_2(ctrl->max_hw_sectors))
- blk_queue_chunk_sectors(q, ctrl->max_hw_sectors);
blk_queue_virt_boundary(q, ctrl->page_size - 1);
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
vwc = true;
@@ -2369,7 +2417,8 @@
if (ctrl->ps_max_latency_us != latency) {
ctrl->ps_max_latency_us = latency;
- nvme_configure_apst(ctrl);
+ if (ctrl->state == NVME_CTRL_LIVE)
+ nvme_configure_apst(ctrl);
}
}
@@ -2404,16 +2453,6 @@
.vid = 0x14a4,
.fr = "22301111",
.quirks = NVME_QUIRK_SIMPLE_SUSPEND,
- },
- {
- /*
- * This Kingston E8FK11.T firmware version has no interrupt
- * after resume with actions related to suspend to idle
- * https://bugzilla.kernel.org/show_bug.cgi?id=204887
- */
- .vid = 0x2646,
- .fr = "E8FK11.T",
- .quirks = NVME_QUIRK_SIMPLE_SUSPEND,
}
};
@@ -2868,7 +2907,7 @@
ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
}
- ret = nvme_mpath_init(ctrl, id);
+ ret = nvme_mpath_init_identify(ctrl, id);
kfree(id);
if (ret < 0)
@@ -2917,10 +2956,26 @@
return -EWOULDBLOCK;
}
+ nvme_get_ctrl(ctrl);
+ if (!try_module_get(ctrl->ops->module)) {
+ nvme_put_ctrl(ctrl);
+ return -EINVAL;
+ }
+
file->private_data = ctrl;
return 0;
}
+static int nvme_dev_release(struct inode *inode, struct file *file)
+{
+ struct nvme_ctrl *ctrl =
+ container_of(inode->i_cdev, struct nvme_ctrl, cdev);
+
+ module_put(ctrl->ops->module);
+ nvme_put_ctrl(ctrl);
+ return 0;
+}
+
static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
{
struct nvme_ns *ns;
@@ -2983,6 +3038,7 @@
static const struct file_operations nvme_dev_fops = {
.owner = THIS_MODULE,
.open = nvme_dev_open,
+ .release = nvme_dev_release,
.unlocked_ioctl = nvme_dev_ioctl,
.compat_ioctl = nvme_dev_ioctl,
};
@@ -3181,6 +3237,10 @@
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ /* Can't delete non-created controllers */
+ if (!ctrl->created)
+ return -EBUSY;
+
if (device_remove_file_self(dev, attr))
nvme_delete_ctrl_sync(ctrl);
return count;
@@ -3306,7 +3366,6 @@
list_for_each_entry(h, &subsys->nsheads, entry) {
if (nvme_ns_ids_valid(&new->ids) &&
- !list_empty(&h->list) &&
nvme_ns_ids_equal(&new->ids, &h->ids))
return -EINVAL;
}
@@ -3401,6 +3460,7 @@
"IDs don't match for shared namespace %d\n",
nsid);
ret = -EINVAL;
+ nvme_put_ns_head(head);
goto out_unlock;
}
}
@@ -3555,10 +3615,14 @@
return 0;
out_put_disk:
+ /* prevent double queue cleanup */
+ ns->disk->queue = NULL;
put_disk(ns->disk);
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
+ if (list_empty(&ns->head->list))
+ list_del_init(&ns->head->entry);
mutex_unlock(&ctrl->subsys->lock);
nvme_put_ns_head(ns->head);
out_free_id:
@@ -3581,7 +3645,10 @@
mutex_lock(&ns->ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
+ if (list_empty(&ns->head->list))
+ list_del_init(&ns->head->entry);
mutex_unlock(&ns->ctrl->subsys->lock);
+
synchronize_rcu(); /* guarantee not available in head->list */
nvme_mpath_clear_current_path(ns);
synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
@@ -3729,8 +3796,7 @@
mutex_lock(&ctrl->scan_lock);
nn = le32_to_cpu(id->nn);
- if (ctrl->vs >= NVME_VS(1, 1, 0) &&
- !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
+ if (!nvme_ctrl_limited_cns(ctrl)) {
if (!nvme_scan_ns_list(ctrl, nn))
goto out_free_id;
}
@@ -3855,7 +3921,7 @@
if (!log)
return;
- if (nvme_get_log(ctrl, NVME_NSID_ALL, 0, NVME_LOG_FW_SLOT, log,
+ if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
sizeof(*log), 0))
dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
kfree(log);
@@ -3975,6 +4041,7 @@
nvme_queue_scan(ctrl);
nvme_start_queues(ctrl);
}
+ ctrl->created = true;
}
EXPORT_SYMBOL_GPL(nvme_start_ctrl);
@@ -3992,7 +4059,7 @@
container_of(dev, struct nvme_ctrl, ctrl_device);
struct nvme_subsystem *subsys = ctrl->subsys;
- if (subsys && ctrl->instance != subsys->instance)
+ if (!subsys || ctrl->instance != subsys->instance)
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
kfree(ctrl->effects);
@@ -4065,6 +4132,7 @@
if (ret)
goto out_release_instance;
+ nvme_get_ctrl(ctrl);
cdev_init(&ctrl->cdev, &nvme_dev_fops);
ctrl->cdev.owner = ops->module;
ret = cdev_device_add(&ctrl->cdev, ctrl->device);
@@ -4080,9 +4148,11 @@
min(default_ps_max_latency_us, (unsigned long)S32_MAX));
nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
+ nvme_mpath_init_ctrl(ctrl);
return 0;
out_free_name:
+ nvme_put_ctrl(ctrl);
kfree_const(ctrl->device->kobj.name);
out_release_instance:
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
@@ -4128,7 +4198,7 @@
}
EXPORT_SYMBOL_GPL(nvme_unfreeze);
-void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
{
struct nvme_ns *ns;
@@ -4139,6 +4209,7 @@
break;
}
up_read(&ctrl->namespaces_rwsem);
+ return timeout;
}
EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
@@ -4186,8 +4257,7 @@
}
EXPORT_SYMBOL_GPL(nvme_start_queues);
-
-void nvme_sync_queues(struct nvme_ctrl *ctrl)
+void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
@@ -4195,7 +4265,12 @@
list_for_each_entry(ns, &ctrl->namespaces, list)
blk_sync_queue(ns->queue);
up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
+void nvme_sync_queues(struct nvme_ctrl *ctrl)
+{
+ nvme_sync_io_queues(ctrl);
if (ctrl->admin_q)
blk_sync_queue(ctrl->admin_q);
}
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 74b8818..d884187 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -336,6 +336,11 @@
cmd->connect.recfmt);
break;
+ case NVME_SC_HOST_PATH_ERROR:
+ dev_err(ctrl->device,
+ "Connect command failed: host path error\n");
+ break;
+
default:
dev_err(ctrl->device,
"Connect command failed, error wo/DNR bit: %d\n",
@@ -565,10 +570,14 @@
struct nvme_request *req = nvme_req(rq);
/*
- * If we are in some state of setup or teardown only allow
- * internally generated commands.
+ * currently we have a problem sending passthru commands
+ * on the admin_q if the controller is not LIVE because we can't
+ * make sure that they are going out after the admin connect,
+ * controller enable and/or other commands in the initialization
+ * sequence. until the controller will be LIVE, fail with
+ * BLK_STS_RESOURCE so that they will be rescheduled.
*/
- if (!blk_rq_is_passthrough(rq) || (req->flags & NVME_REQ_USERCMD))
+ if (rq->q == ctrl->admin_q && (req->flags & NVME_REQ_USERCMD))
return false;
/*
@@ -576,9 +585,8 @@
* which is require to set the queue live in the appropinquate states.
*/
switch (ctrl->state) {
- case NVME_CTRL_NEW:
case NVME_CTRL_CONNECTING:
- if (nvme_is_fabrics(req->cmd) &&
+ if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) &&
req->cmd->fabrics.fctype == nvme_fabrics_type_connect)
return true;
break;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 265f89e..0d2c22c 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1608,7 +1608,7 @@
sizeof(op->rsp_iu), DMA_FROM_DEVICE);
if (opstate == FCPOP_STATE_ABORTED)
- status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
+ status = cpu_to_le16(NVME_SC_HOST_ABORTED_CMD << 1);
else if (freq->status) {
status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1);
dev_info(ctrl->ctrl.device,
@@ -1740,7 +1740,7 @@
if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.cmddma)) {
dev_err(ctrl->dev,
"FCP Op failed - cmdiu dma mapping failed.\n");
- ret = EFAULT;
+ ret = -EFAULT;
goto out_on_error;
}
@@ -1750,7 +1750,7 @@
if (fc_dma_mapping_error(ctrl->lport->dev, op->fcp_req.rspdma)) {
dev_err(ctrl->dev,
"FCP Op failed - rspiu dma mapping failed.\n");
- ret = EFAULT;
+ ret = -EFAULT;
}
atomic_set(&op->state, FCPOP_STATE_IDLE);
@@ -1820,6 +1820,7 @@
struct nvme_fc_fcp_op *aen_op;
int i;
+ cancel_work_sync(&ctrl->ctrl.async_event_work);
aen_op = ctrl->aen_ops;
for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) {
if (!aen_op->fcp_req.private)
@@ -2907,10 +2908,22 @@
static void
__nvme_fc_terminate_io(struct nvme_fc_ctrl *ctrl)
{
- nvme_stop_keep_alive(&ctrl->ctrl);
+ /*
+ * if state is connecting - the error occurred as part of a
+ * reconnect attempt. The create_association error paths will
+ * clean up any outstanding io.
+ *
+ * if it's a different state - ensure all pending io is
+ * terminated. Given this can delay while waiting for the
+ * aborted io to return, we recheck adapter state below
+ * before changing state.
+ */
+ if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
+ nvme_stop_keep_alive(&ctrl->ctrl);
- /* will block will waiting for io to terminate */
- nvme_fc_delete_association(ctrl);
+ /* will block will waiting for io to terminate */
+ nvme_fc_delete_association(ctrl);
+ }
if (ctrl->ctrl.state != NVME_CTRL_CONNECTING &&
!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
@@ -3158,10 +3171,7 @@
goto fail_ctrl;
}
- nvme_get_ctrl(&ctrl->ctrl);
-
if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) {
- nvme_put_ctrl(&ctrl->ctrl);
dev_err(ctrl->ctrl.device,
"NVME-FC{%d}: failed to schedule initial connect\n",
ctrl->cnum);
@@ -3186,6 +3196,7 @@
/* initiate nvme ctrl ref counting teardown */
nvme_uninit_ctrl(&ctrl->ctrl);
+ nvme_put_ctrl(&ctrl->ctrl);
/* Remove core ctrl ref. */
nvme_put_ctrl(&ctrl->ctrl);
@@ -3308,12 +3319,14 @@
spin_lock_irqsave(&nvme_fc_lock, flags);
list_for_each_entry(lport, &nvme_fc_lport_list, port_list) {
if (lport->localport.node_name != laddr.nn ||
- lport->localport.port_name != laddr.pn)
+ lport->localport.port_name != laddr.pn ||
+ lport->localport.port_state != FC_OBJSTATE_ONLINE)
continue;
list_for_each_entry(rport, &lport->endp_list, endp_list) {
if (rport->remoteport.node_name != raddr.nn ||
- rport->remoteport.port_name != raddr.pn)
+ rport->remoteport.port_name != raddr.pn ||
+ rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
continue;
/* if fail to get reference fall through. Will error */
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index e0f064d..590b040 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -3,6 +3,7 @@
* Copyright (c) 2017-2018 Christoph Hellwig.
*/
+#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <trace/events/block.h>
#include "nvme.h"
@@ -64,17 +65,12 @@
}
}
-void nvme_failover_req(struct request *req)
+bool nvme_failover_req(struct request *req)
{
struct nvme_ns *ns = req->q->queuedata;
u16 status = nvme_req(req)->status;
unsigned long flags;
- spin_lock_irqsave(&ns->head->requeue_lock, flags);
- blk_steal_bios(&ns->head->requeue_list, req);
- spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
- blk_mq_end_request(req, 0);
-
switch (status & 0x7ff) {
case NVME_SC_ANA_TRANSITION:
case NVME_SC_ANA_INACCESSIBLE:
@@ -95,6 +91,7 @@
}
break;
case NVME_SC_HOST_PATH_ERROR:
+ case NVME_SC_HOST_ABORTED_CMD:
/*
* Temporary transport disruption in talking to the controller.
* Try to send on a new path.
@@ -102,15 +99,17 @@
nvme_mpath_clear_current_path(ns);
break;
default:
- /*
- * Reset the controller for any non-ANA error as we don't know
- * what caused the error.
- */
- nvme_reset_ctrl(ns->ctrl);
- break;
+ /* This was a non-ANA error so follow the normal error path. */
+ return false;
}
+ spin_lock_irqsave(&ns->head->requeue_lock, flags);
+ blk_steal_bios(&ns->head->requeue_list, req);
+ spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
+ blk_mq_end_request(req, 0);
+
kblockd_schedule_work(&ns->head->requeue_work);
+ return true;
}
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
@@ -234,7 +233,7 @@
}
for (ns = nvme_next_ns(head, old);
- ns != old;
+ ns && ns != old;
ns = nvme_next_ns(head, ns)) {
if (nvme_path_is_disabled(ns))
continue;
@@ -247,6 +246,17 @@
fallback = ns;
}
+ /*
+ * The loop above skips the current path for round-robin semantics.
+ * Fall back to the current path if either:
+ * - no other optimized path found and current is optimized,
+ * - no other usable path found and current is usable.
+ */
+ if (!nvme_path_is_disabled(old) &&
+ (old->ana_state == NVME_ANA_OPTIMIZED ||
+ (!fallback && old->ana_state == NVME_ANA_NONOPTIMIZED)))
+ return old;
+
if (!fallback)
return NULL;
found = fallback;
@@ -267,10 +277,13 @@
struct nvme_ns *ns;
ns = srcu_dereference(head->current_path[node], &head->srcu);
- if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
- ns = nvme_round_robin_path(head, node, ns);
- if (unlikely(!ns || !nvme_path_is_optimized(ns)))
- ns = __nvme_find_path(head, node);
+ if (unlikely(!ns))
+ return __nvme_find_path(head, node);
+
+ if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+ return nvme_round_robin_path(head, node, ns);
+ if (unlikely(!nvme_path_is_optimized(ns)))
+ return __nvme_find_path(head, node);
return ns;
}
@@ -317,7 +330,7 @@
trace_block_bio_remap(bio->bi_disk->queue, bio,
disk_devt(ns->head->disk),
bio->bi_iter.bi_sector);
- ret = direct_make_request(bio);
+ ret = generic_make_request(bio);
} else if (nvme_available_path(head)) {
dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
@@ -412,15 +425,14 @@
{
struct nvme_ns_head *head = ns->head;
- lockdep_assert_held(&ns->head->lock);
-
if (!head->disk)
return;
- if (!(head->disk->flags & GENHD_FL_UP))
+ if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
device_add_disk(&head->subsys->dev, head->disk,
nvme_ns_id_attr_groups);
+ mutex_lock(&head->lock);
if (nvme_path_is_optimized(ns)) {
int node, srcu_idx;
@@ -429,9 +441,10 @@
__nvme_find_path(head, node);
srcu_read_unlock(&head->srcu, srcu_idx);
}
+ mutex_unlock(&head->lock);
- synchronize_srcu(&ns->head->srcu);
- kblockd_schedule_work(&ns->head->requeue_work);
+ synchronize_srcu(&head->srcu);
+ kblockd_schedule_work(&head->requeue_work);
}
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
@@ -482,14 +495,12 @@
static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
struct nvme_ns *ns)
{
- mutex_lock(&ns->head->lock);
ns->ana_grpid = le32_to_cpu(desc->grpid);
ns->ana_state = desc->state;
clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
if (nvme_state_is_live(ns->ana_state))
nvme_mpath_set_live(ns);
- mutex_unlock(&ns->head->lock);
}
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
@@ -509,7 +520,7 @@
if (!nr_nsids)
return 0;
- down_write(&ctrl->namespaces_rwsem);
+ down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list) {
unsigned nsid = le32_to_cpu(desc->nsids[n]);
@@ -520,7 +531,7 @@
if (++n == nr_nsids)
break;
}
- up_write(&ctrl->namespaces_rwsem);
+ up_read(&ctrl->namespaces_rwsem);
return 0;
}
@@ -639,31 +650,49 @@
}
DEVICE_ATTR_RO(ana_state);
-static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
+static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
struct nvme_ana_group_desc *desc, void *data)
{
- struct nvme_ns *ns = data;
+ struct nvme_ana_group_desc *dst = data;
- if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
- nvme_update_ns_ana_state(desc, ns);
- return -ENXIO; /* just break out of the loop */
- }
+ if (desc->grpid != dst->grpid)
+ return 0;
- return 0;
+ *dst = *desc;
+ return -ENXIO; /* just break out of the loop */
}
void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
{
if (nvme_ctrl_use_ana(ns->ctrl)) {
+ struct nvme_ana_group_desc desc = {
+ .grpid = id->anagrpid,
+ .state = 0,
+ };
+
mutex_lock(&ns->ctrl->ana_lock);
ns->ana_grpid = le32_to_cpu(id->anagrpid);
- nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
+ nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
mutex_unlock(&ns->ctrl->ana_lock);
+ if (desc.state) {
+ /* found the group desc: update */
+ nvme_update_ns_ana_state(&desc, ns);
+ } else {
+ /* group desc not found: trigger a re-read */
+ set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+ queue_work(nvme_wq, &ns->ctrl->ana_work);
+ }
} else {
- mutex_lock(&ns->head->lock);
ns->ana_state = NVME_ANA_OPTIMIZED;
nvme_mpath_set_live(ns);
- mutex_unlock(&ns->head->lock);
+ }
+
+ if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
+ struct gendisk *disk = ns->head->disk;
+
+ if (disk)
+ disk->queue->backing_dev_info->capabilities |=
+ BDI_CAP_STABLE_WRITES;
}
}
@@ -678,12 +707,29 @@
kblockd_schedule_work(&head->requeue_work);
flush_work(&head->requeue_work);
blk_cleanup_queue(head->disk->queue);
+ if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
+ /*
+ * if device_add_disk wasn't called, prevent
+ * disk release to put a bogus reference on the
+ * request queue
+ */
+ head->disk->queue = NULL;
+ }
put_disk(head->disk);
}
-int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
{
- int error;
+ mutex_init(&ctrl->ana_lock);
+ timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
+ INIT_WORK(&ctrl->ana_work, nvme_ana_work);
+}
+
+int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+{
+ size_t max_transfer_size = ctrl->max_hw_sectors << SECTOR_SHIFT;
+ size_t ana_log_size;
+ int error = 0;
/* check if multipath is enabled and we have the capability */
if (!multipath || !ctrl->subsys || !(ctrl->subsys->cmic & (1 << 3)))
@@ -694,36 +740,31 @@
ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
- mutex_init(&ctrl->ana_lock);
- timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
- ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
- ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
- ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
-
- if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
+ ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
+ ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
+ ctrl->max_namespaces * sizeof(__le32);
+ if (ana_log_size > max_transfer_size) {
dev_err(ctrl->device,
- "ANA log page size (%zd) larger than MDTS (%d).\n",
- ctrl->ana_log_size,
- ctrl->max_hw_sectors << SECTOR_SHIFT);
+ "ANA log page size (%zd) larger than MDTS (%zd).\n",
+ ana_log_size, max_transfer_size);
dev_err(ctrl->device, "disabling ANA support.\n");
- return 0;
+ goto out_uninit;
}
-
- INIT_WORK(&ctrl->ana_work, nvme_ana_work);
- ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
- if (!ctrl->ana_log_buf) {
- error = -ENOMEM;
- goto out;
+ if (ana_log_size > ctrl->ana_log_size) {
+ nvme_mpath_stop(ctrl);
+ kfree(ctrl->ana_log_buf);
+ ctrl->ana_log_buf = kmalloc(ana_log_size, GFP_KERNEL);
+ if (!ctrl->ana_log_buf)
+ return -ENOMEM;
}
-
+ ctrl->ana_log_size = ana_log_size;
error = nvme_read_ana_log(ctrl);
if (error)
- goto out_free_ana_log_buf;
+ goto out_uninit;
return 0;
-out_free_ana_log_buf:
- kfree(ctrl->ana_log_buf);
- ctrl->ana_log_buf = NULL;
-out:
+
+out_uninit:
+ nvme_mpath_uninit(ctrl);
return error;
}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 22e8401..2df90d4 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -115,6 +115,13 @@
* Prevent tag overlap between queues
*/
NVME_QUIRK_SHARED_TAGS = (1 << 13),
+
+ /*
+ * The controller doesn't handle the Identify Namespace
+ * Identification Descriptor list subcommand despite claiming
+ * NVMe 1.3 compliance.
+ */
+ NVME_QUIRK_NO_NS_DESC_LIST = (1 << 15),
};
/*
@@ -246,6 +253,7 @@
struct nvme_command ka_cmd;
struct work_struct fw_act_work;
unsigned long events;
+ bool created;
#ifdef CONFIG_NVME_MULTIPATH
/* asymmetric namespace access: */
@@ -345,6 +353,8 @@
spinlock_t requeue_lock;
struct work_struct requeue_work;
struct mutex lock;
+ unsigned long flags;
+#define NVME_NSHEAD_DISK_LIVE 0
struct nvme_ns __rcu *current_path[];
#endif
};
@@ -374,7 +384,6 @@
#define NVME_NS_REMOVING 0
#define NVME_NS_DEAD 1
#define NVME_NS_ANA_PENDING 2
- u16 noiob;
struct nvme_fault_inject fault_inject;
@@ -419,9 +428,20 @@
return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65);
}
-static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
+/*
+ * Convert a 512B sector number to a device logical block number.
+ */
+static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
{
- return (sector >> (ns->lba_shift - 9));
+ return sector >> (ns->lba_shift - SECTOR_SHIFT);
+}
+
+/*
+ * Convert a device logical block number to a 512B sector number.
+ */
+static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
+{
+ return lba << (ns->lba_shift - SECTOR_SHIFT);
}
static inline void nvme_end_request(struct request *req, __le16 status,
@@ -448,6 +468,8 @@
void nvme_complete_rq(struct request *req);
bool nvme_cancel_request(struct request *req, void *data, bool reserved);
+void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
+void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
bool nvme_wait_reset(struct nvme_ctrl *ctrl);
@@ -474,9 +496,10 @@
void nvme_start_queues(struct nvme_ctrl *ctrl);
void nvme_kill_queues(struct nvme_ctrl *ctrl);
void nvme_sync_queues(struct nvme_ctrl *ctrl);
+void nvme_sync_io_queues(struct nvme_ctrl *ctrl);
void nvme_unfreeze(struct nvme_ctrl *ctrl);
void nvme_wait_freeze(struct nvme_ctrl *ctrl);
-void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
+int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
void nvme_start_freeze(struct nvme_ctrl *ctrl);
#define NVME_QID_ANY -1
@@ -521,12 +544,13 @@
void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
struct nvme_ctrl *ctrl, int *flags);
-void nvme_failover_req(struct request *req);
+bool nvme_failover_req(struct request *req);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
-int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
+int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
+void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl);
void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
void nvme_mpath_stop(struct nvme_ctrl *ctrl);
bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
@@ -551,6 +575,16 @@
req->bio, status);
}
+static inline void nvme_mpath_update_disk_size(struct gendisk *disk)
+{
+ struct block_device *bdev = bdget_disk(disk, 0);
+
+ if (bdev) {
+ bd_set_size(bdev, get_capacity(disk) << SECTOR_SHIFT);
+ bdput(bdev);
+ }
+}
+
extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute subsys_attr_iopolicy;
@@ -570,8 +604,9 @@
sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
}
-static inline void nvme_failover_req(struct request *req)
+static inline bool nvme_failover_req(struct request *req)
{
+ return false;
}
static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
@@ -602,7 +637,10 @@
blk_status_t status)
{
}
-static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
+static inline void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
+{
+}
+static inline int nvme_mpath_init_identify(struct nvme_ctrl *ctrl,
struct nvme_id_ctrl *id)
{
if (ctrl->subsys->cmic & (1 << 3))
@@ -625,6 +663,9 @@
static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
}
+static inline void nvme_mpath_update_disk_size(struct gendisk *disk)
+{
+}
#endif /* CONFIG_NVME_MULTIPATH */
#ifdef CONFIG_NVM
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 869f462..af516c3 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -68,14 +68,14 @@
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
-static int write_queues;
-module_param(write_queues, int, 0644);
+static unsigned int write_queues;
+module_param(write_queues, uint, 0644);
MODULE_PARM_DESC(write_queues,
"Number of queues to use for writes. If not set, reads and writes "
"will share a queue set.");
-static int poll_queues;
-module_param(poll_queues, int, 0644);
+static unsigned int poll_queues;
+module_param(poll_queues, uint, 0644);
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
struct nvme_dev;
@@ -128,6 +128,9 @@
dma_addr_t host_mem_descs_dma;
struct nvme_host_mem_buf_desc *host_mem_descs;
void **host_mem_desc_bufs;
+ unsigned int nr_allocated_queues;
+ unsigned int nr_write_queues;
+ unsigned int nr_poll_queues;
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -167,7 +170,6 @@
/* only used for poll queues: */
spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
volatile struct nvme_completion *cqes;
- struct blk_mq_tags **tags;
dma_addr_t sq_dma_addr;
dma_addr_t cq_dma_addr;
u32 __iomem *q_db;
@@ -211,25 +213,14 @@
struct scatterlist *sg;
};
-static unsigned int max_io_queues(void)
+static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
{
- return num_possible_cpus() + write_queues + poll_queues;
-}
-
-static unsigned int max_queue_count(void)
-{
- /* IO queues + admin queue */
- return 1 + max_io_queues();
-}
-
-static inline unsigned int nvme_dbbuf_size(u32 stride)
-{
- return (max_queue_count() * 8 * stride);
+ return dev->nr_allocated_queues * 8 * dev->db_stride;
}
static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
- unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+ unsigned int mem_size = nvme_dbbuf_size(dev);
if (dev->dbbuf_dbs)
return 0;
@@ -254,7 +245,7 @@
static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
{
- unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
+ unsigned int mem_size = nvme_dbbuf_size(dev);
if (dev->dbbuf_dbs) {
dma_free_coherent(dev->dev, mem_size,
@@ -280,9 +271,21 @@
nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
}
+static void nvme_dbbuf_free(struct nvme_queue *nvmeq)
+{
+ if (!nvmeq->qid)
+ return;
+
+ nvmeq->dbbuf_sq_db = NULL;
+ nvmeq->dbbuf_cq_db = NULL;
+ nvmeq->dbbuf_sq_ei = NULL;
+ nvmeq->dbbuf_cq_ei = NULL;
+}
+
static void nvme_dbbuf_set(struct nvme_dev *dev)
{
struct nvme_command c;
+ unsigned int i;
if (!dev->dbbuf_dbs)
return;
@@ -296,6 +299,9 @@
dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
/* Free memory and continue on */
nvme_dbbuf_dma_free(dev);
+
+ for (i = 1; i <= dev->online_queues; i++)
+ nvme_dbbuf_free(&dev->queues[i]);
}
}
@@ -377,29 +383,17 @@
WARN_ON(hctx_idx != 0);
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
- WARN_ON(nvmeq->tags);
hctx->driver_data = nvmeq;
- nvmeq->tags = &dev->admin_tagset.tags[0];
return 0;
}
-static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
-{
- struct nvme_queue *nvmeq = hctx->driver_data;
-
- nvmeq->tags = NULL;
-}
-
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
- if (!nvmeq->tags)
- nvmeq->tags = &dev->tagset.tags[hctx_idx];
-
WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
hctx->driver_data = nvmeq;
return 0;
@@ -534,50 +528,71 @@
return true;
}
-static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
+static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
- dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ dma_addr_t dma_addr = iod->first_dma;
int i;
- if (iod->dma_len) {
- dma_unmap_page(dev->dev, dma_addr, iod->dma_len,
- rq_dma_dir(req));
- return;
+ for (i = 0; i < iod->npages; i++) {
+ __le64 *prp_list = nvme_pci_iod_list(req)[i];
+ dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
+
+ dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
+ dma_addr = next_dma_addr;
}
- WARN_ON_ONCE(!iod->nents);
+}
+
+static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
+{
+ const int last_sg = SGES_PER_PAGE - 1;
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ dma_addr_t dma_addr = iod->first_dma;
+ int i;
+
+ for (i = 0; i < iod->npages; i++) {
+ struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i];
+ dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr);
+
+ dma_pool_free(dev->prp_page_pool, sg_list, dma_addr);
+ dma_addr = next_dma_addr;
+ }
+
+}
+
+static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
if (is_pci_p2pdma_page(sg_page(iod->sg)))
pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
rq_dma_dir(req));
else
dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
+}
+static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- if (iod->npages == 0)
- dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
- dma_addr);
-
- for (i = 0; i < iod->npages; i++) {
- void *addr = nvme_pci_iod_list(req)[i];
-
- if (iod->use_sgl) {
- struct nvme_sgl_desc *sg_list = addr;
-
- next_dma_addr =
- le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr);
- } else {
- __le64 *prp_list = addr;
-
- next_dma_addr = le64_to_cpu(prp_list[last_prp]);
- }
-
- dma_pool_free(dev->prp_page_pool, addr, dma_addr);
- dma_addr = next_dma_addr;
+ if (iod->dma_len) {
+ dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
+ rq_dma_dir(req));
+ return;
}
+ WARN_ON_ONCE(!iod->nents);
+
+ nvme_unmap_sg(dev, req);
+ if (iod->npages == 0)
+ dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
+ iod->first_dma);
+ else if (iod->use_sgl)
+ nvme_free_sgls(dev, req);
+ else
+ nvme_free_prps(dev, req);
mempool_free(iod->sg, dev->iod_mempool);
}
@@ -654,7 +669,7 @@
__le64 *old_prp_list = prp_list;
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
if (!prp_list)
- return BLK_STS_RESOURCE;
+ goto free_prps;
list[iod->npages++] = prp_list;
prp_list[0] = old_prp_list[i - 1];
old_prp_list[i - 1] = cpu_to_le64(prp_dma);
@@ -674,14 +689,14 @@
dma_addr = sg_dma_address(sg);
dma_len = sg_dma_len(sg);
}
-
done:
cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
-
return BLK_STS_OK;
-
- bad_sgl:
+free_prps:
+ nvme_free_prps(dev, req);
+ return BLK_STS_RESOURCE;
+bad_sgl:
WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents),
"Invalid SGL for payload:%d nents:%d\n",
blk_rq_payload_bytes(req), iod->nents);
@@ -753,7 +768,7 @@
sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
if (!sg_list)
- return BLK_STS_RESOURCE;
+ goto free_sgls;
i = 0;
nvme_pci_iod_list(req)[iod->npages++] = sg_list;
@@ -766,6 +781,9 @@
} while (--entries > 0);
return BLK_STS_OK;
+free_sgls:
+ nvme_free_sgls(dev, req);
+ return BLK_STS_RESOURCE;
}
static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
@@ -820,7 +838,7 @@
return nvme_setup_prp_simple(dev, req,
&cmnd->rw, &bv);
- if (iod->nvmeq->qid &&
+ if (iod->nvmeq->qid && sgl_threshold &&
dev->ctrl.sgls & ((1 << 0) | (1 << 1)))
return nvme_setup_sgl_simple(dev, req,
&cmnd->rw, &bv);
@@ -834,7 +852,7 @@
sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
if (!iod->nents)
- goto out;
+ goto out_free_sg;
if (is_pci_p2pdma_page(sg_page(iod->sg)))
nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
@@ -843,16 +861,21 @@
nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
rq_dma_dir(req), DMA_ATTR_NO_WARN);
if (!nr_mapped)
- goto out;
+ goto out_free_sg;
iod->use_sgl = nvme_pci_use_sgls(dev, req);
if (iod->use_sgl)
ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
else
ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
-out:
if (ret != BLK_STS_OK)
- nvme_unmap_data(dev, req);
+ goto out_unmap_sg;
+ return BLK_STS_OK;
+
+out_unmap_sg:
+ nvme_unmap_sg(dev, req);
+out_free_sg:
+ mempool_free(iod->sg, dev->iod_mempool);
return ret;
}
@@ -950,18 +973,18 @@
writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
}
+static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
+{
+ if (!nvmeq->qid)
+ return nvmeq->dev->admin_tagset.tags[0];
+ return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
+}
+
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
struct request *req;
- if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
- dev_warn(nvmeq->dev->ctrl.device,
- "invalid id %d completed on queue %d\n",
- cqe->command_id, le16_to_cpu(cqe->sq_id));
- return;
- }
-
/*
* AEN requests are special as they don't time out and can
* survive any kind of queue freeze and often don't respond to
@@ -975,7 +998,14 @@
return;
}
- req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
+ req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
+ if (unlikely(!req)) {
+ dev_warn(nvmeq->dev->ctrl.device,
+ "invalid id %d completed on queue %d\n",
+ cqe->command_id, le16_to_cpu(cqe->sq_id));
+ return;
+ }
+
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
nvme_end_request(req, cqe->status, cqe->result);
}
@@ -1090,9 +1120,9 @@
spin_lock(&nvmeq->cq_poll_lock);
found = nvme_process_cq(nvmeq, &start, &end, -1);
+ nvme_complete_cqes(nvmeq, start, end);
spin_unlock(&nvmeq->cq_poll_lock);
- nvme_complete_cqes(nvmeq, start, end);
return found;
}
@@ -1288,8 +1318,8 @@
dev_warn_ratelimited(dev->ctrl.device,
"I/O %d QID %d timeout, disable controller\n",
req->tag, nvmeq->qid);
- nvme_dev_disable(dev, true);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+ nvme_dev_disable(dev, true);
return BLK_EH_DONE;
case NVME_CTRL_RESETTING:
return BLK_EH_RESET_TIMER;
@@ -1306,10 +1336,10 @@
dev_warn(dev->ctrl.device,
"I/O %d QID %d timeout, reset controller\n",
req->tag, nvmeq->qid);
+ nvme_req(req)->flags |= NVME_REQ_CANCELLED;
nvme_dev_disable(dev, false);
nvme_reset_ctrl(&dev->ctrl);
- nvme_req(req)->flags |= NVME_REQ_CANCELLED;
return BLK_EH_DONE;
}
@@ -1413,6 +1443,23 @@
nvme_poll_irqdisable(nvmeq, -1);
}
+/*
+ * Called only on a device that has been disabled and after all other threads
+ * that can check this device's completion queues have synced. This is the
+ * last chance for the driver to see a natural completion before
+ * nvme_cancel_request() terminates all incomplete requests.
+ */
+static void nvme_reap_pending_cqes(struct nvme_dev *dev)
+{
+ u16 start, end;
+ int i;
+
+ for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
+ nvme_process_cq(&dev->queues[i], &start, &end, -1);
+ nvme_complete_cqes(&dev->queues[i], start, end);
+ }
+}
+
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
int entry_size)
{
@@ -1578,7 +1625,6 @@
.queue_rq = nvme_queue_rq,
.complete = nvme_pci_complete_rq,
.init_hctx = nvme_admin_init_hctx,
- .exit_hctx = nvme_admin_exit_hctx,
.init_request = nvme_init_request,
.timeout = nvme_timeout,
};
@@ -2020,7 +2066,7 @@
static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
{
struct nvme_dev *dev = affd->priv;
- unsigned int nr_read_queues;
+ unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;
/*
* If there is no interupt available for queues, ensure that
@@ -2036,12 +2082,12 @@
if (!nrirqs) {
nrirqs = 1;
nr_read_queues = 0;
- } else if (nrirqs == 1 || !write_queues) {
+ } else if (nrirqs == 1 || !nr_write_queues) {
nr_read_queues = 0;
- } else if (write_queues >= nrirqs) {
+ } else if (nr_write_queues >= nrirqs) {
nr_read_queues = 1;
} else {
- nr_read_queues = nrirqs - write_queues;
+ nr_read_queues = nrirqs - nr_write_queues;
}
dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
@@ -2060,21 +2106,17 @@
.priv = dev,
};
unsigned int irq_queues, this_p_queues;
- unsigned int nr_cpus = num_possible_cpus();
/*
* Poll queues don't need interrupts, but we need at least one IO
* queue left over for non-polled IO.
*/
- this_p_queues = poll_queues;
+ this_p_queues = dev->nr_poll_queues;
if (this_p_queues >= nr_io_queues) {
this_p_queues = nr_io_queues - 1;
irq_queues = 1;
} else {
- if (nr_cpus < nr_io_queues - this_p_queues)
- irq_queues = nr_cpus + 1;
- else
- irq_queues = nr_io_queues - this_p_queues + 1;
+ irq_queues = nr_io_queues - this_p_queues + 1;
}
dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
@@ -2099,14 +2141,25 @@
__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
}
+static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
+{
+ return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
+}
+
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
struct nvme_queue *adminq = &dev->queues[0];
struct pci_dev *pdev = to_pci_dev(dev->dev);
- int result, nr_io_queues;
+ unsigned int nr_io_queues;
unsigned long size;
+ int result;
- nr_io_queues = max_io_queues();
+ /*
+ * Sample the module parameters once at reset time so that we have
+ * stable values to work with.
+ */
+ dev->nr_write_queues = write_queues;
+ dev->nr_poll_queues = poll_queues;
/*
* If tags are shared with admin queue (Apple bug), then
@@ -2114,6 +2167,9 @@
*/
if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
nr_io_queues = 1;
+ else
+ nr_io_queues = min(nvme_max_io_queues(dev),
+ dev->nr_allocated_queues - 1);
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
@@ -2252,11 +2308,6 @@
if (timeout == 0)
return false;
- /* handle any remaining CQEs */
- if (opcode == nvme_admin_delete_cq &&
- !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags))
- nvme_poll_irqdisable(nvmeq, -1);
-
sent--;
if (nr_queues)
goto retry;
@@ -2445,6 +2496,7 @@
nvme_suspend_io_queues(dev);
nvme_suspend_queue(&dev->queues[0]);
nvme_pci_disable(dev);
+ nvme_reap_pending_cqes(dev);
blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
@@ -2538,7 +2590,9 @@
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
int result;
- if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) {
+ if (dev->ctrl.state != NVME_CTRL_RESETTING) {
+ dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
+ dev->ctrl.state);
result = -ENODEV;
goto out;
}
@@ -2751,6 +2805,18 @@
(dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
return NVME_QUIRK_NO_APST;
+ } else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 ||
+ pdev->device == 0xa808 || pdev->device == 0xa809)) ||
+ (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) {
+ /*
+ * Forcing to use host managed nvme power settings for
+ * lowest idle power with quick resume latency on
+ * Samsung and Toshiba SSDs based on suspend behavior
+ * on Coffee Lake board for LENOVO C640
+ */
+ if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
+ dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
+ return NVME_QUIRK_SIMPLE_SUSPEND;
}
return 0;
@@ -2780,8 +2846,11 @@
if (!dev)
return -ENOMEM;
- dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
- GFP_KERNEL, node);
+ dev->nr_write_queues = write_queues;
+ dev->nr_poll_queues = poll_queues;
+ dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
+ dev->queues = kcalloc_node(dev->nr_allocated_queues,
+ sizeof(struct nvme_queue), GFP_KERNEL, node);
if (!dev->queues)
goto free;
@@ -2827,7 +2896,6 @@
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
nvme_reset_ctrl(&dev->ctrl);
- nvme_get_ctrl(&dev->ctrl);
async_schedule(nvme_async_probe, dev);
return 0;
@@ -2888,7 +2956,6 @@
if (!pci_device_is_present(pdev)) {
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
nvme_dev_disable(dev, true);
- nvme_dev_remove_admin(dev);
}
flush_work(&dev->ctrl.reset_work);
@@ -2948,9 +3015,15 @@
* the PCI bus layer to put it into D3 in order to take the PCIe link
* down, so as to allow the platform to achieve its minimum low-power
* state (which may not be possible if the link is up).
+ *
+ * If a host memory buffer is enabled, shut down the device as the NVMe
+ * specification allows the device to access the host memory buffer in
+ * host DRAM from all power states, but hosts will fail access to DRAM
+ * during S3.
*/
if (pm_suspend_via_firmware() || !ctrl->npss ||
!pcie_aspm_enabled(pdev) ||
+ ndev->nr_host_mem_descs ||
(ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
return nvme_disable_prepare_reset(ndev, true);
@@ -3082,14 +3155,18 @@
NVME_QUIRK_DEALLOCATE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0xf1a5), /* Intel 600P/P3100 */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
- NVME_QUIRK_MEDIUM_PRIO_SQ },
+ NVME_QUIRK_MEDIUM_PRIO_SQ |
+ NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_VDEVICE(INTEL, 0xf1a6), /* Intel 760p/Pro 7600p */
.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */
.driver_data = NVME_QUIRK_IDENTIFY_CNS |
NVME_QUIRK_DISABLE_WRITE_ZEROES, },
+ { PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */
+ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST, },
{ PCI_DEVICE(0x1bb1, 0x0100), /* Seagate Nytro Flash Storage */
- .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
+ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
+ NVME_QUIRK_NO_NS_DESC_LIST, },
{ PCI_DEVICE(0x1c58, 0x0003), /* HGST adapter */
.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
{ PCI_DEVICE(0x1c58, 0x0023), /* WDC SN200 adapter */
@@ -3099,7 +3176,14 @@
{ PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */
.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
{ PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */
- .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
+ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
+ NVME_QUIRK_DISABLE_WRITE_ZEROES|
+ NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x1987, 0x5016), /* Phison E16 */
+ .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x1b4b, 0x1092), /* Lexar 256 GB SSD */
+ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
+ NVME_QUIRK_IGNORE_DEV_SUBNQN, },
{ PCI_DEVICE(0x1d1d, 0x1f1f), /* LighNVM qemu device */
.driver_data = NVME_QUIRK_LIGHTNVM, },
{ PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */
@@ -3111,8 +3195,13 @@
{ PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x1c5c, 0x1504), /* SK Hynix PC400 */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
- { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
+ { PCI_DEVICE(0x2646, 0x2263), /* KINGSTON A2000 NVMe SSD */
+ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
+ { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
+ .driver_data = NVME_QUIRK_SINGLE_VECTOR },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
.driver_data = NVME_QUIRK_SINGLE_VECTOR |
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index cb4c300..dcc3d23 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -451,7 +451,7 @@
* Spread I/O queues completion vectors according their queue index.
* Admin queues can always go on completion vector 0.
*/
- comp_vector = idx == 0 ? idx : idx - 1;
+ comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
/* Polling queues need direct cq polling context */
if (nvme_rdma_poll_queue(queue))
@@ -665,10 +665,13 @@
if (ret)
return ret;
- ctrl->ctrl.queue_count = nr_io_queues + 1;
- if (ctrl->ctrl.queue_count < 2)
- return 0;
+ if (nr_io_queues == 0) {
+ dev_err(ctrl->ctrl.device,
+ "unable to set any I/O queues\n");
+ return -ENOMEM;
+ }
+ ctrl->ctrl.queue_count = nr_io_queues + 1;
dev_info(ctrl->ctrl.device,
"creating %d I/O queues.\n", nr_io_queues);
@@ -768,6 +771,7 @@
blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
}
if (ctrl->async_event_sqe.data) {
+ cancel_work_sync(&ctrl->ctrl.async_event_work);
nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
sizeof(struct nvme_command), DMA_TO_DEVICE);
ctrl->async_event_sqe.data = NULL;
@@ -834,12 +838,16 @@
error = nvme_init_identify(&ctrl->ctrl);
if (error)
- goto out_stop_queue;
+ goto out_quiesce_queue;
return 0;
+out_quiesce_queue:
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+ blk_sync_queue(ctrl->ctrl.admin_q);
out_stop_queue:
nvme_rdma_stop_queue(&ctrl->queues[0]);
+ nvme_cancel_admin_tagset(&ctrl->ctrl);
out_cleanup_queue:
if (new)
blk_cleanup_queue(ctrl->ctrl.admin_q);
@@ -850,9 +858,11 @@
if (new)
blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
out_free_async_qe:
- nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
- sizeof(struct nvme_command), DMA_TO_DEVICE);
- ctrl->async_event_sqe.data = NULL;
+ if (ctrl->async_event_sqe.data) {
+ nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
+ sizeof(struct nvme_command), DMA_TO_DEVICE);
+ ctrl->async_event_sqe.data = NULL;
+ }
out_free_queue:
nvme_rdma_free_queue(&ctrl->queues[0]);
return error;
@@ -888,18 +898,36 @@
ret = PTR_ERR(ctrl->ctrl.connect_q);
goto out_free_tag_set;
}
- } else {
- blk_mq_update_nr_hw_queues(&ctrl->tag_set,
- ctrl->ctrl.queue_count - 1);
}
ret = nvme_rdma_start_io_queues(ctrl);
if (ret)
goto out_cleanup_connect_q;
+ if (!new) {
+ nvme_start_queues(&ctrl->ctrl);
+ if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
+ /*
+ * If we timed out waiting for freeze we are likely to
+ * be stuck. Fail the controller initialization just
+ * to be safe.
+ */
+ ret = -ENODEV;
+ goto out_wait_freeze_timed_out;
+ }
+ blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
+ ctrl->ctrl.queue_count - 1);
+ nvme_unfreeze(&ctrl->ctrl);
+ }
+
return 0;
+out_wait_freeze_timed_out:
+ nvme_stop_queues(&ctrl->ctrl);
+ nvme_sync_io_queues(&ctrl->ctrl);
+ nvme_rdma_stop_io_queues(ctrl);
out_cleanup_connect_q:
+ nvme_cancel_tagset(&ctrl->ctrl);
if (new)
blk_cleanup_queue(ctrl->ctrl.connect_q);
out_free_tag_set:
@@ -914,6 +942,7 @@
bool remove)
{
blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+ blk_sync_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
if (ctrl->ctrl.admin_tagset) {
blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
@@ -929,7 +958,9 @@
bool remove)
{
if (ctrl->ctrl.queue_count > 1) {
+ nvme_start_freeze(&ctrl->ctrl);
nvme_stop_queues(&ctrl->ctrl);
+ nvme_sync_io_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
if (ctrl->ctrl.tagset) {
blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
@@ -1032,10 +1063,18 @@
return 0;
destroy_io:
- if (ctrl->ctrl.queue_count > 1)
+ if (ctrl->ctrl.queue_count > 1) {
+ nvme_stop_queues(&ctrl->ctrl);
+ nvme_sync_io_queues(&ctrl->ctrl);
+ nvme_rdma_stop_io_queues(ctrl);
+ nvme_cancel_tagset(&ctrl->ctrl);
nvme_rdma_destroy_io_queues(ctrl, new);
+ }
destroy_admin:
+ blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+ blk_sync_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
+ nvme_cancel_admin_tagset(&ctrl->ctrl);
nvme_rdma_destroy_admin_queue(ctrl, new);
return ret;
}
@@ -1088,7 +1127,8 @@
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
return;
- queue_work(nvme_wq, &ctrl->err_work);
+ dev_warn(ctrl->ctrl.device, "starting error recovery\n");
+ queue_work(nvme_reset_wq, &ctrl->err_work);
}
static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
@@ -1494,6 +1534,14 @@
return;
}
+ /* sanity checking for received data length */
+ if (unlikely(wc->byte_len < len)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "Unexpected nvme completion length(%d)\n", wc->byte_len);
+ nvme_rdma_error_recovery(queue->ctrl);
+ return;
+ }
+
ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
/*
* AEN requests are special as they don't time out and can
@@ -1653,7 +1701,6 @@
complete(&queue->cm_done);
return 0;
case RDMA_CM_EVENT_REJECTED:
- nvme_rdma_destroy_queue_ib(queue);
cm_error = nvme_rdma_conn_rejected(queue, ev);
break;
case RDMA_CM_EVENT_ROUTE_ERROR:
@@ -1691,6 +1738,18 @@
return 0;
}
+static void nvme_rdma_complete_timed_out(struct request *rq)
+{
+ struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_rdma_queue *queue = req->queue;
+
+ nvme_rdma_stop_queue(queue);
+ if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) {
+ nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
+ blk_mq_complete_request(rq);
+ }
+}
+
static enum blk_eh_timer_return
nvme_rdma_timeout(struct request *rq, bool reserved)
{
@@ -1701,29 +1760,29 @@
dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
rq->tag, nvme_rdma_queue_idx(queue));
- /*
- * Restart the timer if a controller reset is already scheduled. Any
- * timed out commands would be handled before entering the connecting
- * state.
- */
- if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
- return BLK_EH_RESET_TIMER;
-
if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
/*
- * Teardown immediately if controller times out while starting
- * or we are already started error recovery. all outstanding
- * requests are completed on shutdown, so we return BLK_EH_DONE.
+ * If we are resetting, connecting or deleting we should
+ * complete immediately because we may block controller
+ * teardown or setup sequence
+ * - ctrl disable/shutdown fabrics requests
+ * - connect requests
+ * - initialization admin requests
+ * - I/O requests that entered after unquiescing and
+ * the controller stopped responding
+ *
+ * All other requests should be cancelled by the error
+ * recovery work, so it's fine that we fail it here.
*/
- flush_work(&ctrl->err_work);
- nvme_rdma_teardown_io_queues(ctrl, false);
- nvme_rdma_teardown_admin_queue(ctrl, false);
+ nvme_rdma_complete_timed_out(rq);
return BLK_EH_DONE;
}
- dev_warn(ctrl->ctrl.device, "starting error recovery\n");
+ /*
+ * LIVE state should trigger the normal error recovery which will
+ * handle completing this request.
+ */
nvme_rdma_error_recovery(ctrl);
-
return BLK_EH_RESET_TIMER;
}
@@ -2045,8 +2104,6 @@
dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
- nvme_get_ctrl(&ctrl->ctrl);
-
mutex_lock(&nvme_rdma_ctrl_mutex);
list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
mutex_unlock(&nvme_rdma_ctrl_mutex);
@@ -2056,6 +2113,7 @@
out_uninit_ctrl:
nvme_uninit_ctrl(&ctrl->ctrl);
nvme_put_ctrl(&ctrl->ctrl);
+ nvme_put_ctrl(&ctrl->ctrl);
if (ret > 0)
ret = -EIO;
return ERR_PTR(ret);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 7544be8..38bbbbb 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -164,16 +164,14 @@
static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
{
struct request *rq;
- unsigned int bytes;
if (unlikely(nvme_tcp_async_req(req)))
return false; /* async events don't have a request */
rq = blk_mq_rq_from_pdu(req);
- bytes = blk_rq_payload_bytes(rq);
- return rq_data_dir(rq) == WRITE && bytes &&
- bytes <= nvme_tcp_inline_data_size(req->queue);
+ return rq_data_dir(rq) == WRITE && req->data_len &&
+ req->data_len <= nvme_tcp_inline_data_size(req->queue);
}
static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
@@ -188,7 +186,7 @@
static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
{
- return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
+ return min_t(size_t, iov_iter_single_seg_count(&req->iter),
req->pdu_len - req->pdu_sent);
}
@@ -422,7 +420,8 @@
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
return;
- queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work);
+ dev_warn(ctrl->device, "starting error recovery\n");
+ queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
}
static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
@@ -513,6 +512,13 @@
req->pdu_len = le32_to_cpu(pdu->r2t_length);
req->pdu_sent = 0;
+ if (unlikely(!req->pdu_len)) {
+ dev_err(queue->ctrl->ctrl.device,
+ "req %d r2t len is %u, probably a bug...\n",
+ rq->tag, req->pdu_len);
+ return -EPROTO;
+ }
+
if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
dev_err(queue->ctrl->ctrl.device,
"req %d r2t len %u exceeded data len %u (%zu sent)\n",
@@ -636,17 +642,9 @@
unsigned int *offset, size_t *len)
{
struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
- struct nvme_tcp_request *req;
- struct request *rq;
-
- rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
- if (!rq) {
- dev_err(queue->ctrl->ctrl.device,
- "queue %d tag %#x not found\n",
- nvme_tcp_queue_id(queue), pdu->command_id);
- return -ENOENT;
- }
- req = blk_mq_rq_to_pdu(rq);
+ struct request *rq =
+ blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
while (true) {
int recv_len, ret;
@@ -786,11 +784,11 @@
{
struct nvme_tcp_queue *queue;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
queue = sk->sk_user_data;
if (likely(queue && queue->rd_enabled))
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
static void nvme_tcp_write_space(struct sock *sk)
@@ -810,7 +808,7 @@
{
struct nvme_tcp_queue *queue;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
queue = sk->sk_user_data;
if (!queue)
goto done;
@@ -832,7 +830,7 @@
queue->state_change(sk);
done:
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
@@ -861,12 +859,11 @@
else
flags |= MSG_MORE;
- /* can't zcopy slab pages */
- if (unlikely(PageSlab(page))) {
- ret = sock_no_sendpage(queue->sock, page, offset, len,
+ if (sendpage_ok(page)) {
+ ret = kernel_sendpage(queue->sock, page, offset, len,
flags);
} else {
- ret = kernel_sendpage(queue->sock, page, offset, len,
+ ret = sock_no_sendpage(queue->sock, page, offset, len,
flags);
}
if (ret <= 0)
@@ -1054,7 +1051,12 @@
} else if (unlikely(result < 0)) {
dev_err(queue->ctrl->ctrl.device,
"failed to send request %d\n", result);
- if (result != -EPIPE)
+
+ /*
+ * Fail the request unless peer closed the connection,
+ * in which case error recovery flow will complete all.
+ */
+ if ((result != -EPIPE) && (result != -ECONNRESET))
nvme_tcp_fail_request(queue->request);
nvme_tcp_done_send_req(queue);
return;
@@ -1316,6 +1318,9 @@
}
}
+ /* Set 10 seconds timeout for icresp recvmsg */
+ queue->sock->sk->sk_rcvtimeo = 10 * HZ;
+
queue->sock->sk->sk_allocation = GFP_ATOMIC;
if (!qid)
n = 0;
@@ -1432,7 +1437,6 @@
if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
return;
-
__nvme_tcp_stop_queue(queue);
}
@@ -1500,6 +1504,7 @@
static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
{
if (to_tcp_ctrl(ctrl)->async_req.pdu) {
+ cancel_work_sync(&ctrl->async_event_work);
nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
}
@@ -1636,10 +1641,13 @@
if (ret)
return ret;
- ctrl->queue_count = nr_io_queues + 1;
- if (ctrl->queue_count < 2)
- return 0;
+ if (nr_io_queues == 0) {
+ dev_err(ctrl->device,
+ "unable to set any I/O queues\n");
+ return -ENOMEM;
+ }
+ ctrl->queue_count = nr_io_queues + 1;
dev_info(ctrl->device,
"creating %d I/O queues.\n", nr_io_queues);
@@ -1678,18 +1686,36 @@
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
}
- } else {
- blk_mq_update_nr_hw_queues(ctrl->tagset,
- ctrl->queue_count - 1);
}
ret = nvme_tcp_start_io_queues(ctrl);
if (ret)
goto out_cleanup_connect_q;
+ if (!new) {
+ nvme_start_queues(ctrl);
+ if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
+ /*
+ * If we timed out waiting for freeze we are likely to
+ * be stuck. Fail the controller initialization just
+ * to be safe.
+ */
+ ret = -ENODEV;
+ goto out_wait_freeze_timed_out;
+ }
+ blk_mq_update_nr_hw_queues(ctrl->tagset,
+ ctrl->queue_count - 1);
+ nvme_unfreeze(ctrl);
+ }
+
return 0;
+out_wait_freeze_timed_out:
+ nvme_stop_queues(ctrl);
+ nvme_sync_io_queues(ctrl);
+ nvme_tcp_stop_io_queues(ctrl);
out_cleanup_connect_q:
+ nvme_cancel_tagset(ctrl);
if (new)
blk_cleanup_queue(ctrl->connect_q);
out_free_tag_set:
@@ -1751,12 +1777,16 @@
error = nvme_init_identify(ctrl);
if (error)
- goto out_stop_queue;
+ goto out_quiesce_queue;
return 0;
+out_quiesce_queue:
+ blk_mq_quiesce_queue(ctrl->admin_q);
+ blk_sync_queue(ctrl->admin_q);
out_stop_queue:
nvme_tcp_stop_queue(ctrl, 0);
+ nvme_cancel_admin_tagset(ctrl);
out_cleanup_queue:
if (new)
blk_cleanup_queue(ctrl->admin_q);
@@ -1775,6 +1805,7 @@
bool remove)
{
blk_mq_quiesce_queue(ctrl->admin_q);
+ blk_sync_queue(ctrl->admin_q);
nvme_tcp_stop_queue(ctrl, 0);
if (ctrl->admin_tagset) {
blk_mq_tagset_busy_iter(ctrl->admin_tagset,
@@ -1791,7 +1822,10 @@
{
if (ctrl->queue_count <= 1)
return;
+ blk_mq_quiesce_queue(ctrl->admin_q);
+ nvme_start_freeze(ctrl);
nvme_stop_queues(ctrl);
+ nvme_sync_io_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
if (ctrl->tagset) {
blk_mq_tagset_busy_iter(ctrl->tagset,
@@ -1866,10 +1900,18 @@
return 0;
destroy_io:
- if (ctrl->queue_count > 1)
+ if (ctrl->queue_count > 1) {
+ nvme_stop_queues(ctrl);
+ nvme_sync_io_queues(ctrl);
+ nvme_tcp_stop_io_queues(ctrl);
+ nvme_cancel_tagset(ctrl);
nvme_tcp_destroy_io_queues(ctrl, new);
+ }
destroy_admin:
+ blk_mq_quiesce_queue(ctrl->admin_q);
+ blk_sync_queue(ctrl->admin_q);
nvme_tcp_stop_queue(ctrl, 0);
+ nvme_cancel_admin_tagset(ctrl);
nvme_tcp_destroy_admin_queue(ctrl, new);
return ret;
}
@@ -2039,40 +2081,52 @@
nvme_tcp_queue_request(&ctrl->async_req);
}
+static void nvme_tcp_complete_timed_out(struct request *rq)
+{
+ struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+ struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
+
+ nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
+ if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) {
+ nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
+ blk_mq_complete_request(rq);
+ }
+}
+
static enum blk_eh_timer_return
nvme_tcp_timeout(struct request *rq, bool reserved)
{
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
- struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
+ struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
struct nvme_tcp_cmd_pdu *pdu = req->pdu;
- /*
- * Restart the timer if a controller reset is already scheduled. Any
- * timed out commands would be handled before entering the connecting
- * state.
- */
- if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
- return BLK_EH_RESET_TIMER;
-
- dev_warn(ctrl->ctrl.device,
+ dev_warn(ctrl->device,
"queue %d: timeout request %#x type %d\n",
nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
- if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
+ if (ctrl->state != NVME_CTRL_LIVE) {
/*
- * Teardown immediately if controller times out while starting
- * or we are already started error recovery. all outstanding
- * requests are completed on shutdown, so we return BLK_EH_DONE.
+ * If we are resetting, connecting or deleting we should
+ * complete immediately because we may block controller
+ * teardown or setup sequence
+ * - ctrl disable/shutdown fabrics requests
+ * - connect requests
+ * - initialization admin requests
+ * - I/O requests that entered after unquiescing and
+ * the controller stopped responding
+ *
+ * All other requests should be cancelled by the error
+ * recovery work, so it's fine that we fail it here.
*/
- flush_work(&ctrl->err_work);
- nvme_tcp_teardown_io_queues(&ctrl->ctrl, false);
- nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false);
+ nvme_tcp_complete_timed_out(rq);
return BLK_EH_DONE;
}
- dev_warn(ctrl->ctrl.device, "starting error recovery\n");
- nvme_tcp_error_recovery(&ctrl->ctrl);
-
+ /*
+ * LIVE state should trigger the normal error recovery which will
+ * handle completing this request.
+ */
+ nvme_tcp_error_recovery(ctrl);
return BLK_EH_RESET_TIMER;
}
@@ -2085,7 +2139,9 @@
c->common.flags |= NVME_CMD_SGL_METABUF;
- if (rq_data_dir(rq) == WRITE && req->data_len &&
+ if (!blk_rq_nr_phys_segments(rq))
+ nvme_tcp_set_sg_null(c);
+ else if (rq_data_dir(rq) == WRITE &&
req->data_len <= nvme_tcp_inline_data_size(queue))
nvme_tcp_set_sg_inline(queue, c, req->data_len);
else
@@ -2112,7 +2168,8 @@
req->data_sent = 0;
req->pdu_len = 0;
req->pdu_sent = 0;
- req->data_len = blk_rq_payload_bytes(rq);
+ req->data_len = blk_rq_nr_phys_segments(rq) ?
+ blk_rq_payload_bytes(rq) : 0;
req->curr_bio = rq->bio;
if (rq_data_dir(rq) == WRITE &&
@@ -2354,8 +2411,6 @@
dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
- nvme_get_ctrl(&ctrl->ctrl);
-
mutex_lock(&nvme_tcp_ctrl_mutex);
list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
mutex_unlock(&nvme_tcp_ctrl_mutex);
@@ -2365,6 +2420,7 @@
out_uninit_ctrl:
nvme_uninit_ctrl(&ctrl->ctrl);
nvme_put_ctrl(&ctrl->ctrl);
+ nvme_put_ctrl(&ctrl->ctrl);
if (ret > 0)
ret = -EIO;
return ERR_PTR(ret);
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index daaf700..35bac7a 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -56,7 +56,7 @@
__field(u8, fctype)
__field(u16, cid)
__field(u32, nsid)
- __field(u64, metadata)
+ __field(bool, metadata)
__array(u8, cdw10, 24)
),
TP_fast_assign(
@@ -66,13 +66,13 @@
__entry->flags = cmd->common.flags;
__entry->cid = cmd->common.command_id;
__entry->nsid = le32_to_cpu(cmd->common.nsid);
- __entry->metadata = le64_to_cpu(cmd->common.metadata);
+ __entry->metadata = !!blk_integrity_rq(req);
__entry->fctype = cmd->fabrics.fctype;
__assign_disk_name(__entry->disk, req->rq_disk);
memcpy(__entry->cdw10, &cmd->common.cdw10,
sizeof(__entry->cdw10));
),
- TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
+ TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%x, cmd=(%s %s)",
__entry->ctrl_id, __print_disk_name(__entry->disk),
__entry->qid, __entry->cid, __entry->nsid,
__entry->flags, __entry->metadata,
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 3a67e24..ee81d94 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -369,6 +369,9 @@
static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl)
{
+ if (unlikely(ctrl->kato == 0))
+ return;
+
pr_debug("ctrl %d start keep-alive timer for %d secs\n",
ctrl->cntlid, ctrl->kato);
@@ -378,6 +381,9 @@
static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
{
+ if (unlikely(ctrl->kato == 0))
+ return;
+
pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid);
cancel_delayed_work_sync(&ctrl->ka_work);
@@ -555,7 +561,8 @@
} else {
struct nvmet_ns *old;
- list_for_each_entry_rcu(old, &subsys->namespaces, dev_link) {
+ list_for_each_entry_rcu(old, &subsys->namespaces, dev_link,
+ lockdep_is_held(&subsys->lock)) {
BUG_ON(ns->nsid == old->nsid);
if (ns->nsid < old->nsid)
break;
@@ -871,8 +878,6 @@
req->error_loc = NVMET_NO_ERROR_LOC;
req->error_slba = 0;
- trace_nvmet_req_init(req, req->cmd);
-
/* no support for fused commands yet */
if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
req->error_loc = offsetof(struct nvme_common_command, flags);
@@ -906,6 +911,8 @@
if (status)
goto fail;
+ trace_nvmet_req_init(req, req->cmd);
+
if (unlikely(!percpu_ref_tryget_live(&sq->ref))) {
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
goto fail;
@@ -1024,9 +1031,20 @@
{
lockdep_assert_held(&ctrl->lock);
- if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
- nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES ||
- nvmet_cc_mps(ctrl->cc) != 0 ||
+ /*
+ * Only I/O controllers should verify iosqes,iocqes.
+ * Strictly speaking, the spec says a discovery controller
+ * should verify iosqes,iocqes are zeroed, however that
+ * would break backwards compatibility, so don't enforce it.
+ */
+ if (ctrl->subsys->type != NVME_NQN_DISC &&
+ (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES ||
+ nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES)) {
+ ctrl->csts = NVME_CSTS_CFS;
+ return;
+ }
+
+ if (nvmet_cc_mps(ctrl->cc) != 0 ||
nvmet_cc_ams(ctrl->cc) != 0 ||
nvmet_cc_css(ctrl->cc) != 0) {
ctrl->csts = NVME_CSTS_CFS;
@@ -1041,7 +1059,8 @@
* in case a host died before it enabled the controller. Hence, simply
* reset the keep alive timer when the controller is enabled.
*/
- mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
+ if (ctrl->kato)
+ mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
}
static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl)
@@ -1174,7 +1193,8 @@
ctrl->p2p_client = get_device(req->p2p_client);
- list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link)
+ list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link,
+ lockdep_is_held(&ctrl->subsys->lock))
nvmet_p2pmem_ns_add_p2p(ctrl, ns);
}
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index d16b55f..5e47395 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -105,6 +105,7 @@
u16 qid = le16_to_cpu(c->qid);
u16 sqsize = le16_to_cpu(c->sqsize);
struct nvmet_ctrl *old;
+ u16 ret;
old = cmpxchg(&req->sq->ctrl, NULL, ctrl);
if (old) {
@@ -115,7 +116,9 @@
if (!sqsize) {
pr_warn("queue size zero!\n");
req->error_loc = offsetof(struct nvmf_connect_command, sqsize);
- return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
+ req->cqe->result.u32 = IPO_IATTR_CONNECT_SQE(sqsize);
+ ret = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
+ goto err;
}
/* note: convert queue size from 0's-based value to 1's-based value */
@@ -128,16 +131,19 @@
}
if (ctrl->ops->install_queue) {
- u16 ret = ctrl->ops->install_queue(req->sq);
-
+ ret = ctrl->ops->install_queue(req->sq);
if (ret) {
pr_err("failed to install queue %d cntlid %d ret %x\n",
- qid, ret, ctrl->cntlid);
- return ret;
+ qid, ctrl->cntlid, ret);
+ goto err;
}
}
return 0;
+
+err:
+ req->sq->ctrl = NULL;
+ return ret;
}
static void nvmet_execute_admin_connect(struct nvmet_req *req)
@@ -245,11 +251,11 @@
}
status = nvmet_install_queue(ctrl, req);
- if (status) {
- /* pass back cntlid that had the issue of installing queue */
- req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
+ if (status)
goto out_ctrl_put;
- }
+
+ /* pass back cntlid for successful completion */
+ req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index ce8d819..9b07e8c 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -1994,9 +1994,9 @@
return;
if (fcpreq->fcp_error ||
fcpreq->transferred_length != fcpreq->transfer_length) {
- spin_lock(&fod->flock);
+ spin_lock_irqsave(&fod->flock, flags);
fod->abort = true;
- spin_unlock(&fod->flock);
+ spin_unlock_irqrestore(&fod->flock, flags);
nvmet_req_complete(&fod->req, NVME_SC_INTERNAL);
return;
@@ -2152,13 +2152,6 @@
int ret;
/*
- * if there is no nvmet mapping to the targetport there
- * shouldn't be requests. just terminate them.
- */
- if (!tgtport->pe)
- goto transport_error;
-
- /*
* Fused commands are currently not supported in the linux
* implementation.
*
@@ -2185,7 +2178,8 @@
fod->req.cmd = &fod->cmdiubuf.sqe;
fod->req.cqe = &fod->rspiubuf.cqe;
- fod->req.port = tgtport->pe->port;
+ if (tgtport->pe)
+ fod->req.port = tgtport->pe->port;
/* clear any response payload */
memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf));
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 05453f5..6ca17a0 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -38,9 +38,11 @@
ns->file = filp_open(ns->device_path, flags, 0);
if (IS_ERR(ns->file)) {
- pr_err("failed to open file %s: (%ld)\n",
- ns->device_path, PTR_ERR(ns->file));
- return PTR_ERR(ns->file);
+ ret = PTR_ERR(ns->file);
+ pr_err("failed to open file %s: (%d)\n",
+ ns->device_path, ret);
+ ns->file = NULL;
+ return ret;
}
ret = vfs_getattr(&ns->file->f_path,
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 11f5aea..f657a12 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -252,7 +252,8 @@
static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
{
- clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
+ if (!test_and_clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags))
+ return;
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
blk_cleanup_queue(ctrl->ctrl.admin_q);
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
@@ -288,6 +289,7 @@
clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags);
nvmet_sq_destroy(&ctrl->queues[i].nvme_sq);
}
+ ctrl->ctrl.queue_count = 1;
}
static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl)
@@ -394,6 +396,7 @@
return 0;
out_cleanup_queue:
+ clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
blk_cleanup_queue(ctrl->ctrl.admin_q);
out_cleanup_fabrics_q:
blk_cleanup_queue(ctrl->ctrl.fabrics_q);
@@ -619,8 +622,6 @@
dev_info(ctrl->ctrl.device,
"new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn);
- nvme_get_ctrl(&ctrl->ctrl);
-
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
@@ -638,6 +639,7 @@
kfree(ctrl->queues);
out_uninit_ctrl:
nvme_uninit_ctrl(&ctrl->ctrl);
+ nvme_put_ctrl(&ctrl->ctrl);
out_put_ctrl:
nvme_put_ctrl(&ctrl->ctrl);
if (ret > 0)
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 36d906a..50e2007 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -75,6 +75,7 @@
struct nvmet_rdma_queue {
struct rdma_cm_id *cm_id;
+ struct ib_qp *qp;
struct nvmet_port *port;
struct ib_cq *cq;
atomic_t sq_wr_avail;
@@ -464,7 +465,7 @@
if (ndev->srq)
ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
else
- ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);
+ ret = ib_post_recv(cmd->queue->qp, &cmd->wr, NULL);
if (unlikely(ret))
pr_err("post_recv cmd failed\n");
@@ -503,7 +504,7 @@
atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
if (rsp->n_rdma) {
- rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
+ rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
queue->cm_id->port_num, rsp->req.sg,
rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
}
@@ -587,7 +588,7 @@
WARN_ON(rsp->n_rdma <= 0);
atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
- rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
+ rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
queue->cm_id->port_num, rsp->req.sg,
rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
rsp->n_rdma = 0;
@@ -742,7 +743,7 @@
}
if (nvmet_rdma_need_data_in(rsp)) {
- if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
+ if (rdma_rw_ctx_post(&rsp->rw, queue->qp,
queue->cm_id->port_num, &rsp->read_cqe, NULL))
nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
} else {
@@ -1025,6 +1026,7 @@
pr_err("failed to create_qp ret= %d\n", ret);
goto err_destroy_cq;
}
+ queue->qp = queue->cm_id->qp;
atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);
@@ -1053,11 +1055,10 @@
static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
- struct ib_qp *qp = queue->cm_id->qp;
-
- ib_drain_qp(qp);
- rdma_destroy_id(queue->cm_id);
- ib_destroy_qp(qp);
+ ib_drain_qp(queue->qp);
+ if (queue->cm_id)
+ rdma_destroy_id(queue->cm_id);
+ ib_destroy_qp(queue->qp);
ib_free_cq(queue->cq);
}
@@ -1291,9 +1292,12 @@
ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
if (ret) {
- schedule_work(&queue->release_work);
- /* Destroying rdma_cm id is not needed here */
- return 0;
+ /*
+ * Don't destroy the cm_id in free path, as we implicitly
+ * destroy the cm_id here with non-zero ret code.
+ */
+ queue->cm_id = NULL;
+ goto free_queue;
}
mutex_lock(&nvmet_rdma_queue_mutex);
@@ -1302,6 +1306,8 @@
return 0;
+free_queue:
+ nvmet_rdma_free_queue(queue);
put_device:
kref_put(&ndev->ref, nvmet_rdma_free_dev);
@@ -1345,6 +1351,16 @@
spin_lock_irqsave(&queue->state_lock, flags);
switch (queue->state) {
case NVMET_RDMA_Q_CONNECTING:
+ while (!list_empty(&queue->rsp_wait_list)) {
+ struct nvmet_rdma_rsp *rsp;
+
+ rsp = list_first_entry(&queue->rsp_wait_list,
+ struct nvmet_rdma_rsp,
+ wait_list);
+ list_del(&rsp->wait_list);
+ nvmet_rdma_put_rsp(rsp);
+ }
+ fallthrough;
case NVMET_RDMA_Q_LIVE:
queue->state = NVMET_RDMA_Q_DISCONNECTING;
disconnect = true;
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index d535080..2ae8462 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -150,6 +150,11 @@
static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
struct nvmet_tcp_cmd *cmd)
{
+ if (unlikely(!queue->nr_cmds)) {
+ /* We didn't allocate cmds yet, send 0xffff */
+ return USHRT_MAX;
+ }
+
return cmd - queue->cmds;
}
@@ -287,7 +292,7 @@
length = cmd->pdu_len;
cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
offset = cmd->rbytes_done;
- cmd->sg_idx = DIV_ROUND_UP(offset, PAGE_SIZE);
+ cmd->sg_idx = offset / PAGE_SIZE;
sg_offset = offset % PAGE_SIZE;
sg = &cmd->req.sg[cmd->sg_idx];
@@ -300,6 +305,7 @@
length -= iov_len;
sg = sg_next(sg);
iov++;
+ sg_offset = 0;
}
iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
@@ -515,7 +521,7 @@
return 1;
}
-static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
+static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
struct nvmet_tcp_queue *queue = cmd->queue;
int ret;
@@ -523,9 +529,15 @@
while (cmd->cur_sg) {
struct page *page = sg_page(cmd->cur_sg);
u32 left = cmd->cur_sg->length - cmd->offset;
+ int flags = MSG_DONTWAIT;
+
+ if ((!last_in_batch && cmd->queue->send_list_len) ||
+ cmd->wbytes_done + left < cmd->req.transfer_len ||
+ queue->data_digest || !queue->nvme_sq.sqhd_disabled)
+ flags |= MSG_MORE;
ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
- left, MSG_DONTWAIT | MSG_MORE);
+ left, flags);
if (ret <= 0)
return ret;
@@ -660,7 +672,7 @@
}
if (cmd->state == NVMET_TCP_SEND_DATA) {
- ret = nvmet_try_send_data(cmd);
+ ret = nvmet_try_send_data(cmd, last_in_batch);
if (ret <= 0)
goto done_send;
}
@@ -788,7 +800,7 @@
icresp->hdr.pdo = 0;
icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
- icresp->maxdata = cpu_to_le32(0xffff); /* FIXME: support r2t */
+ icresp->maxdata = cpu_to_le32(0x400000); /* 16M arbitrary limit */
icresp->cpda = 0;
if (queue->hdr_digest)
icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
@@ -841,7 +853,10 @@
struct nvme_tcp_data_pdu *data = &queue->pdu.data;
struct nvmet_tcp_cmd *cmd;
- cmd = &queue->cmds[data->ttag];
+ if (likely(queue->nr_cmds))
+ cmd = &queue->cmds[data->ttag];
+ else
+ cmd = &queue->connect;
if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
pr_err("ttag %u unexpected data offset %u (expected %u)\n",
@@ -1387,7 +1402,7 @@
{
struct nvmet_tcp_queue *queue;
- write_lock_bh(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
queue = sk->sk_user_data;
if (!queue)
goto done;
@@ -1397,7 +1412,6 @@
case TCP_CLOSE_WAIT:
case TCP_CLOSE:
/* FALLTHRU */
- sk->sk_user_data = NULL;
nvmet_tcp_schedule_release_queue(queue);
break;
default:
@@ -1405,7 +1419,7 @@
queue->idx, sk->sk_state);
}
done:
- write_unlock_bh(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
diff --git a/drivers/nvme/target/trace.h b/drivers/nvme/target/trace.h
index e645caa..3f61b66 100644
--- a/drivers/nvme/target/trace.h
+++ b/drivers/nvme/target/trace.h
@@ -46,19 +46,12 @@
return req->sq->ctrl;
}
-static inline void __assign_disk_name(char *name, struct nvmet_req *req,
- bool init)
+static inline void __assign_req_name(char *name, struct nvmet_req *req)
{
- struct nvmet_ctrl *ctrl = nvmet_req_to_ctrl(req);
- struct nvmet_ns *ns;
-
- if ((init && req->sq->qid) || (!init && req->cq->qid)) {
- ns = nvmet_find_namespace(ctrl, req->cmd->rw.nsid);
- strncpy(name, ns->device_path, DISK_NAME_LEN);
- return;
- }
-
- memset(name, 0, DISK_NAME_LEN);
+ if (req->ns)
+ strncpy(name, req->ns->device_path, DISK_NAME_LEN);
+ else
+ memset(name, 0, DISK_NAME_LEN);
}
#endif
@@ -81,7 +74,7 @@
TP_fast_assign(
__entry->cmd = cmd;
__entry->ctrl = nvmet_req_to_ctrl(req);
- __assign_disk_name(__entry->disk, req, true);
+ __assign_req_name(__entry->disk, req);
__entry->qid = req->sq->qid;
__entry->cid = cmd->common.command_id;
__entry->opcode = cmd->common.opcode;
@@ -121,7 +114,7 @@
__entry->cid = req->cqe->command_id;
__entry->result = le64_to_cpu(req->cqe->result.u64);
__entry->status = le16_to_cpu(req->cqe->status) >> 1;
- __assign_disk_name(__entry->disk, req, false);
+ __assign_req_name(__entry->disk, req);
),
TP_printk("nvmet%s: %sqid=%d, cmdid=%u, res=%#llx, status=%#x",
__print_ctrl_name(__entry->ctrl),