Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig
index 7b146b6..0653f4f 100644
--- a/drivers/infiniband/hw/hfi1/Kconfig
+++ b/drivers/infiniband/hw/hfi1/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config INFINIBAND_HFI1
tristate "Intel OPA Gen1 support"
depends on X86_64 && INFINIBAND_RDMAVT && I2C
diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile
index f451ba9..0405d26 100644
--- a/drivers/infiniband/hw/hfi1/Makefile
+++ b/drivers/infiniband/hw/hfi1/Makefile
@@ -8,12 +8,45 @@
#
obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o
-hfi1-y := affinity.o chip.o device.o driver.o efivar.o \
- eprom.o exp_rcv.o file_ops.o firmware.o \
- init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \
- qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \
- uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \
- verbs_txreq.o vnic_main.o vnic_sdma.o
+hfi1-y := \
+ affinity.o \
+ aspm.o \
+ chip.o \
+ device.o \
+ driver.o \
+ efivar.o \
+ eprom.o \
+ exp_rcv.o \
+ file_ops.o \
+ firmware.o \
+ init.o \
+ intr.o \
+ iowait.o \
+ mad.o \
+ mmu_rb.o \
+ msix.o \
+ opfn.o \
+ pcie.o \
+ pio.o \
+ pio_copy.o \
+ platform.o \
+ qp.o \
+ qsfp.o \
+ rc.o \
+ ruc.o \
+ sdma.o \
+ sysfs.o \
+ tid_rdma.o \
+ trace.o \
+ uc.o \
+ ud.o \
+ user_exp_rcv.o \
+ user_pages.o \
+ user_sdma.o \
+ verbs.o \
+ verbs_txreq.o \
+ vnic_main.o \
+ vnic_sdma.o
ifdef CONFIG_DEBUG_FS
hfi1-y += debugfs.o
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
index bedd5fb..c142b23 100644
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -48,6 +48,7 @@
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/numa.h>
#include "hfi.h"
#include "affinity.h"
@@ -777,7 +778,7 @@
_dev_comp_vect_cpu_mask_clean_up(dd, entry);
unlock:
mutex_unlock(&node_affinity.lock);
- dd->node = -1;
+ dd->node = NUMA_NO_NODE;
}
/*
@@ -817,10 +818,10 @@
set = &entry->def_intr;
cpumask_set_cpu(cpu, &set->mask);
cpumask_set_cpu(cpu, &set->used);
- for (i = 0; i < dd->num_msix_entries; i++) {
+ for (i = 0; i < dd->msix_info.max_requested; i++) {
struct hfi1_msix_entry *other_msix;
- other_msix = &dd->msix_entries[i];
+ other_msix = &dd->msix_info.msix_entries[i];
if (other_msix->type != IRQ_SDMA || other_msix == msix)
continue;
@@ -1037,7 +1038,7 @@
struct hfi1_affinity_node *entry;
cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
const struct cpumask *node_mask,
- *proc_mask = ¤t->cpus_allowed;
+ *proc_mask = current->cpus_ptr;
struct hfi1_affinity_node_list *affinity = &node_affinity;
struct cpu_mask_set *set = &affinity->proc;
@@ -1045,7 +1046,7 @@
* check whether process/context affinity has already
* been set
*/
- if (cpumask_weight(proc_mask) == 1) {
+ if (current->nr_cpus_allowed == 1) {
hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
current->pid, current->comm,
cpumask_pr_args(proc_mask));
@@ -1056,7 +1057,7 @@
cpu = cpumask_first(proc_mask);
cpumask_set_cpu(cpu, &set->used);
goto done;
- } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) {
+ } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
current->pid, current->comm,
cpumask_pr_args(proc_mask));
diff --git a/drivers/infiniband/hw/hfi1/aspm.c b/drivers/infiniband/hw/hfi1/aspm.c
new file mode 100644
index 0000000..a3c53be
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/aspm.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2019 Intel Corporation.
+ *
+ */
+
+#include "aspm.h"
+
+/* Time after which the timer interrupt will re-enable ASPM */
+#define ASPM_TIMER_MS 1000
+/* Time for which interrupts are ignored after a timer has been scheduled */
+#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2)
+/* Two interrupts within this time trigger ASPM disable */
+#define ASPM_TRIGGER_MS 1
+#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull)
+#define ASPM_L1_SUPPORTED(reg) \
+ ((((reg) & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2)
+
+uint aspm_mode = ASPM_MODE_DISABLED;
+module_param_named(aspm, aspm_mode, uint, 0444);
+MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
+
+static bool aspm_hw_l1_supported(struct hfi1_devdata *dd)
+{
+ struct pci_dev *parent = dd->pcidev->bus->self;
+ u32 up, dn;
+
+ /*
+ * If the driver does not have access to the upstream component,
+ * it cannot support ASPM L1 at all.
+ */
+ if (!parent)
+ return false;
+
+ pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn);
+ dn = ASPM_L1_SUPPORTED(dn);
+
+ pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up);
+ up = ASPM_L1_SUPPORTED(up);
+
+ /* ASPM works on A-step but is reported as not supported */
+ return (!!dn || is_ax(dd)) && !!up;
+}
+
+/* Set L1 entrance latency for slower entry to L1 */
+static void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd)
+{
+ u32 l1_ent_lat = 0x4u;
+ u32 reg32;
+
+ pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, ®32);
+ reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK;
+ reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT;
+ pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32);
+}
+
+static void aspm_hw_enable_l1(struct hfi1_devdata *dd)
+{
+ struct pci_dev *parent = dd->pcidev->bus->self;
+
+ /*
+ * If the driver does not have access to the upstream component,
+ * it cannot support ASPM L1 at all.
+ */
+ if (!parent)
+ return;
+
+ /* Enable ASPM L1 first in upstream component and then downstream */
+ pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
+ PCI_EXP_LNKCTL_ASPMC,
+ PCI_EXP_LNKCTL_ASPM_L1);
+ pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
+ PCI_EXP_LNKCTL_ASPMC,
+ PCI_EXP_LNKCTL_ASPM_L1);
+}
+
+void aspm_hw_disable_l1(struct hfi1_devdata *dd)
+{
+ struct pci_dev *parent = dd->pcidev->bus->self;
+
+ /* Disable ASPM L1 first in downstream component and then upstream */
+ pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
+ PCI_EXP_LNKCTL_ASPMC, 0x0);
+ if (parent)
+ pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
+ PCI_EXP_LNKCTL_ASPMC, 0x0);
+}
+
+static void aspm_enable(struct hfi1_devdata *dd)
+{
+ if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED ||
+ !dd->aspm_supported)
+ return;
+
+ aspm_hw_enable_l1(dd);
+ dd->aspm_enabled = true;
+}
+
+static void aspm_disable(struct hfi1_devdata *dd)
+{
+ if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED)
+ return;
+
+ aspm_hw_disable_l1(dd);
+ dd->aspm_enabled = false;
+}
+
+static void aspm_disable_inc(struct hfi1_devdata *dd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->aspm_lock, flags);
+ aspm_disable(dd);
+ atomic_inc(&dd->aspm_disabled_cnt);
+ spin_unlock_irqrestore(&dd->aspm_lock, flags);
+}
+
+static void aspm_enable_dec(struct hfi1_devdata *dd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->aspm_lock, flags);
+ if (atomic_dec_and_test(&dd->aspm_disabled_cnt))
+ aspm_enable(dd);
+ spin_unlock_irqrestore(&dd->aspm_lock, flags);
+}
+
+/* ASPM processing for each receive context interrupt */
+void __aspm_ctx_disable(struct hfi1_ctxtdata *rcd)
+{
+ bool restart_timer;
+ bool close_interrupts;
+ unsigned long flags;
+ ktime_t now, prev;
+
+ spin_lock_irqsave(&rcd->aspm_lock, flags);
+ /* PSM contexts are open */
+ if (!rcd->aspm_intr_enable)
+ goto unlock;
+
+ prev = rcd->aspm_ts_last_intr;
+ now = ktime_get();
+ rcd->aspm_ts_last_intr = now;
+
+ /* An interrupt pair close together in time */
+ close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS;
+
+ /* Don't push out our timer till this much time has elapsed */
+ restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) >
+ ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC;
+ restart_timer = restart_timer && close_interrupts;
+
+ /* Disable ASPM and schedule timer */
+ if (rcd->aspm_enabled && close_interrupts) {
+ aspm_disable_inc(rcd->dd);
+ rcd->aspm_enabled = false;
+ restart_timer = true;
+ }
+
+ if (restart_timer) {
+ mod_timer(&rcd->aspm_timer,
+ jiffies + msecs_to_jiffies(ASPM_TIMER_MS));
+ rcd->aspm_ts_timer_sched = now;
+ }
+unlock:
+ spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+}
+
+/* Timer function for re-enabling ASPM in the absence of interrupt activity */
+static void aspm_ctx_timer_function(struct timer_list *t)
+{
+ struct hfi1_ctxtdata *rcd = from_timer(rcd, t, aspm_timer);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rcd->aspm_lock, flags);
+ aspm_enable_dec(rcd->dd);
+ rcd->aspm_enabled = true;
+ spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+}
+
+/*
+ * Disable interrupt processing for verbs contexts when PSM or VNIC contexts
+ * are open.
+ */
+void aspm_disable_all(struct hfi1_devdata *dd)
+{
+ struct hfi1_ctxtdata *rcd;
+ unsigned long flags;
+ u16 i;
+
+ for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
+ rcd = hfi1_rcd_get_by_index(dd, i);
+ if (rcd) {
+ del_timer_sync(&rcd->aspm_timer);
+ spin_lock_irqsave(&rcd->aspm_lock, flags);
+ rcd->aspm_intr_enable = false;
+ spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+ hfi1_rcd_put(rcd);
+ }
+ }
+
+ aspm_disable(dd);
+ atomic_set(&dd->aspm_disabled_cnt, 0);
+}
+
+/* Re-enable interrupt processing for verbs contexts */
+void aspm_enable_all(struct hfi1_devdata *dd)
+{
+ struct hfi1_ctxtdata *rcd;
+ unsigned long flags;
+ u16 i;
+
+ aspm_enable(dd);
+
+ if (aspm_mode != ASPM_MODE_DYNAMIC)
+ return;
+
+ for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
+ rcd = hfi1_rcd_get_by_index(dd, i);
+ if (rcd) {
+ spin_lock_irqsave(&rcd->aspm_lock, flags);
+ rcd->aspm_intr_enable = true;
+ rcd->aspm_enabled = true;
+ spin_unlock_irqrestore(&rcd->aspm_lock, flags);
+ hfi1_rcd_put(rcd);
+ }
+ }
+}
+
+static void aspm_ctx_init(struct hfi1_ctxtdata *rcd)
+{
+ spin_lock_init(&rcd->aspm_lock);
+ timer_setup(&rcd->aspm_timer, aspm_ctx_timer_function, 0);
+ rcd->aspm_intr_supported = rcd->dd->aspm_supported &&
+ aspm_mode == ASPM_MODE_DYNAMIC &&
+ rcd->ctxt < rcd->dd->first_dyn_alloc_ctxt;
+}
+
+void aspm_init(struct hfi1_devdata *dd)
+{
+ struct hfi1_ctxtdata *rcd;
+ u16 i;
+
+ spin_lock_init(&dd->aspm_lock);
+ dd->aspm_supported = aspm_hw_l1_supported(dd);
+
+ for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
+ rcd = hfi1_rcd_get_by_index(dd, i);
+ if (rcd)
+ aspm_ctx_init(rcd);
+ hfi1_rcd_put(rcd);
+ }
+
+ /* Start with ASPM disabled */
+ aspm_hw_set_l1_ent_latency(dd);
+ dd->aspm_enabled = false;
+ aspm_hw_disable_l1(dd);
+
+ /* Now turn on ASPM if configured */
+ aspm_enable_all(dd);
+}
+
+void aspm_exit(struct hfi1_devdata *dd)
+{
+ aspm_disable_all(dd);
+
+ /* Turn on ASPM on exit to conserve power */
+ aspm_enable(dd);
+}
+
diff --git a/drivers/infiniband/hw/hfi1/aspm.h b/drivers/infiniband/hw/hfi1/aspm.h
index e813387..75d5d18 100644
--- a/drivers/infiniband/hw/hfi1/aspm.h
+++ b/drivers/infiniband/hw/hfi1/aspm.h
@@ -57,266 +57,20 @@
ASPM_MODE_DYNAMIC = 2, /* ASPM enabled/disabled dynamically */
};
-/* Time after which the timer interrupt will re-enable ASPM */
-#define ASPM_TIMER_MS 1000
-/* Time for which interrupts are ignored after a timer has been scheduled */
-#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2)
-/* Two interrupts within this time trigger ASPM disable */
-#define ASPM_TRIGGER_MS 1
-#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull)
-#define ASPM_L1_SUPPORTED(reg) \
- (((reg & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2)
+void aspm_init(struct hfi1_devdata *dd);
+void aspm_exit(struct hfi1_devdata *dd);
+void aspm_hw_disable_l1(struct hfi1_devdata *dd);
+void __aspm_ctx_disable(struct hfi1_ctxtdata *rcd);
+void aspm_disable_all(struct hfi1_devdata *dd);
+void aspm_enable_all(struct hfi1_devdata *dd);
-static inline bool aspm_hw_l1_supported(struct hfi1_devdata *dd)
-{
- struct pci_dev *parent = dd->pcidev->bus->self;
- u32 up, dn;
-
- /*
- * If the driver does not have access to the upstream component,
- * it cannot support ASPM L1 at all.
- */
- if (!parent)
- return false;
-
- pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn);
- dn = ASPM_L1_SUPPORTED(dn);
-
- pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up);
- up = ASPM_L1_SUPPORTED(up);
-
- /* ASPM works on A-step but is reported as not supported */
- return (!!dn || is_ax(dd)) && !!up;
-}
-
-/* Set L1 entrance latency for slower entry to L1 */
-static inline void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd)
-{
- u32 l1_ent_lat = 0x4u;
- u32 reg32;
-
- pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, ®32);
- reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK;
- reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT;
- pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32);
-}
-
-static inline void aspm_hw_enable_l1(struct hfi1_devdata *dd)
-{
- struct pci_dev *parent = dd->pcidev->bus->self;
-
- /*
- * If the driver does not have access to the upstream component,
- * it cannot support ASPM L1 at all.
- */
- if (!parent)
- return;
-
- /* Enable ASPM L1 first in upstream component and then downstream */
- pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
- PCI_EXP_LNKCTL_ASPMC,
- PCI_EXP_LNKCTL_ASPM_L1);
- pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
- PCI_EXP_LNKCTL_ASPMC,
- PCI_EXP_LNKCTL_ASPM_L1);
-}
-
-static inline void aspm_hw_disable_l1(struct hfi1_devdata *dd)
-{
- struct pci_dev *parent = dd->pcidev->bus->self;
-
- /* Disable ASPM L1 first in downstream component and then upstream */
- pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL,
- PCI_EXP_LNKCTL_ASPMC, 0x0);
- if (parent)
- pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL,
- PCI_EXP_LNKCTL_ASPMC, 0x0);
-}
-
-static inline void aspm_enable(struct hfi1_devdata *dd)
-{
- if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED ||
- !dd->aspm_supported)
- return;
-
- aspm_hw_enable_l1(dd);
- dd->aspm_enabled = true;
-}
-
-static inline void aspm_disable(struct hfi1_devdata *dd)
-{
- if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED)
- return;
-
- aspm_hw_disable_l1(dd);
- dd->aspm_enabled = false;
-}
-
-static inline void aspm_disable_inc(struct hfi1_devdata *dd)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&dd->aspm_lock, flags);
- aspm_disable(dd);
- atomic_inc(&dd->aspm_disabled_cnt);
- spin_unlock_irqrestore(&dd->aspm_lock, flags);
-}
-
-static inline void aspm_enable_dec(struct hfi1_devdata *dd)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&dd->aspm_lock, flags);
- if (atomic_dec_and_test(&dd->aspm_disabled_cnt))
- aspm_enable(dd);
- spin_unlock_irqrestore(&dd->aspm_lock, flags);
-}
-
-/* ASPM processing for each receive context interrupt */
static inline void aspm_ctx_disable(struct hfi1_ctxtdata *rcd)
{
- bool restart_timer;
- bool close_interrupts;
- unsigned long flags;
- ktime_t now, prev;
-
/* Quickest exit for minimum impact */
- if (!rcd->aspm_intr_supported)
+ if (likely(!rcd->aspm_intr_supported))
return;
- spin_lock_irqsave(&rcd->aspm_lock, flags);
- /* PSM contexts are open */
- if (!rcd->aspm_intr_enable)
- goto unlock;
-
- prev = rcd->aspm_ts_last_intr;
- now = ktime_get();
- rcd->aspm_ts_last_intr = now;
-
- /* An interrupt pair close together in time */
- close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS;
-
- /* Don't push out our timer till this much time has elapsed */
- restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) >
- ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC;
- restart_timer = restart_timer && close_interrupts;
-
- /* Disable ASPM and schedule timer */
- if (rcd->aspm_enabled && close_interrupts) {
- aspm_disable_inc(rcd->dd);
- rcd->aspm_enabled = false;
- restart_timer = true;
- }
-
- if (restart_timer) {
- mod_timer(&rcd->aspm_timer,
- jiffies + msecs_to_jiffies(ASPM_TIMER_MS));
- rcd->aspm_ts_timer_sched = now;
- }
-unlock:
- spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-}
-
-/* Timer function for re-enabling ASPM in the absence of interrupt activity */
-static inline void aspm_ctx_timer_function(struct timer_list *t)
-{
- struct hfi1_ctxtdata *rcd = from_timer(rcd, t, aspm_timer);
- unsigned long flags;
-
- spin_lock_irqsave(&rcd->aspm_lock, flags);
- aspm_enable_dec(rcd->dd);
- rcd->aspm_enabled = true;
- spin_unlock_irqrestore(&rcd->aspm_lock, flags);
-}
-
-/*
- * Disable interrupt processing for verbs contexts when PSM or VNIC contexts
- * are open.
- */
-static inline void aspm_disable_all(struct hfi1_devdata *dd)
-{
- struct hfi1_ctxtdata *rcd;
- unsigned long flags;
- u16 i;
-
- for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
- rcd = hfi1_rcd_get_by_index(dd, i);
- if (rcd) {
- del_timer_sync(&rcd->aspm_timer);
- spin_lock_irqsave(&rcd->aspm_lock, flags);
- rcd->aspm_intr_enable = false;
- spin_unlock_irqrestore(&rcd->aspm_lock, flags);
- hfi1_rcd_put(rcd);
- }
- }
-
- aspm_disable(dd);
- atomic_set(&dd->aspm_disabled_cnt, 0);
-}
-
-/* Re-enable interrupt processing for verbs contexts */
-static inline void aspm_enable_all(struct hfi1_devdata *dd)
-{
- struct hfi1_ctxtdata *rcd;
- unsigned long flags;
- u16 i;
-
- aspm_enable(dd);
-
- if (aspm_mode != ASPM_MODE_DYNAMIC)
- return;
-
- for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
- rcd = hfi1_rcd_get_by_index(dd, i);
- if (rcd) {
- spin_lock_irqsave(&rcd->aspm_lock, flags);
- rcd->aspm_intr_enable = true;
- rcd->aspm_enabled = true;
- spin_unlock_irqrestore(&rcd->aspm_lock, flags);
- hfi1_rcd_put(rcd);
- }
- }
-}
-
-static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd)
-{
- spin_lock_init(&rcd->aspm_lock);
- timer_setup(&rcd->aspm_timer, aspm_ctx_timer_function, 0);
- rcd->aspm_intr_supported = rcd->dd->aspm_supported &&
- aspm_mode == ASPM_MODE_DYNAMIC &&
- rcd->ctxt < rcd->dd->first_dyn_alloc_ctxt;
-}
-
-static inline void aspm_init(struct hfi1_devdata *dd)
-{
- struct hfi1_ctxtdata *rcd;
- u16 i;
-
- spin_lock_init(&dd->aspm_lock);
- dd->aspm_supported = aspm_hw_l1_supported(dd);
-
- for (i = 0; i < dd->first_dyn_alloc_ctxt; i++) {
- rcd = hfi1_rcd_get_by_index(dd, i);
- if (rcd)
- aspm_ctx_init(rcd);
- hfi1_rcd_put(rcd);
- }
-
- /* Start with ASPM disabled */
- aspm_hw_set_l1_ent_latency(dd);
- dd->aspm_enabled = false;
- aspm_hw_disable_l1(dd);
-
- /* Now turn on ASPM if configured */
- aspm_enable_all(dd);
-}
-
-static inline void aspm_exit(struct hfi1_devdata *dd)
-{
- aspm_disable_all(dd);
-
- /* Turn on ASPM on exit to conserve power */
- aspm_enable(dd);
+ __aspm_ctx_disable(rcd);
}
#endif /* _ASPM_H */
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 902d12d..9b1fb84 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -67,8 +67,6 @@
#include "debugfs.h"
#include "fault.h"
-#define NUM_IB_PORTS 1
-
uint kdeth_qp;
module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
@@ -1074,6 +1072,8 @@
static void log_physical_state(struct hfi1_pportdata *ppd, u32 state);
static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state,
int msecs);
+static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd,
+ int msecs);
static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
static void handle_temp_err(struct hfi1_devdata *dd);
@@ -1100,9 +1100,9 @@
const char *desc;
};
-#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START)
-#define NUM_DC_ERRS (IS_DC_END - IS_DC_START)
-#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START)
+#define NUM_MISC_ERRS (IS_GENERAL_ERR_END + 1 - IS_GENERAL_ERR_START)
+#define NUM_DC_ERRS (IS_DC_END + 1 - IS_DC_START)
+#define NUM_VARIOUS (IS_VARIOUS_END + 1 - IS_VARIOUS_START)
/*
* Helpers for building HFI and DC error interrupt table entries. Different
@@ -4101,9 +4101,13 @@
def_access_ibp_counter(rdma_seq);
def_access_ibp_counter(unaligned);
def_access_ibp_counter(seq_naks);
+def_access_ibp_counter(rc_crwaits);
static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH),
+[C_RX_LEN_ERR] = RXE32_DEV_CNTR_ELEM(RxLenErr, RCV_LENGTH_ERR_CNT, CNTR_SYNTH),
+[C_RX_ICRC_ERR] = RXE32_DEV_CNTR_ELEM(RxICrcErr, RCV_ICRC_ERR_CNT, CNTR_SYNTH),
+[C_RX_EBP] = RXE32_DEV_CNTR_ELEM(RxEbpCnt, RCV_EBP_CNT, CNTR_SYNTH),
[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT,
CNTR_NORMAL),
[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT,
@@ -4253,6 +4257,8 @@
access_sw_pio_drain),
[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
access_sw_kmem_wait),
+[C_SW_TID_WAIT] = CNTR_ELEM("TidWait", 0, 0, CNTR_NORMAL,
+ hfi1_access_sw_tid_wait),
[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
access_sw_send_schedule),
[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
@@ -5114,6 +5120,7 @@
[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq),
[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned),
[C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks),
+[C_SW_IBP_RC_CRWAITS] = SW_IBP_CNTR(RcCrWait, rc_crwaits),
[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL,
access_sw_cpu_rc_acks),
[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL,
@@ -5222,6 +5229,17 @@
return (chip_rev_minor & 0xF0) == 0x10;
}
+/* return true is kernel urg disabled for rcd */
+bool is_urg_masked(struct hfi1_ctxtdata *rcd)
+{
+ u64 mask;
+ u32 is = IS_RCVURGENT_START + rcd->ctxt;
+ u8 bit = is % 64;
+
+ mask = read_csr(rcd->dd, CCE_INT_MASK + (8 * (is / 64)));
+ return !(mask & BIT_ULL(bit));
+}
+
/*
* Append string s to buffer buf. Arguments curp and len are the current
* position and remaining length, respectively.
@@ -8181,7 +8199,7 @@
/**
* is_rcv_urgent_int() - User receive context urgent IRQ handler
* @dd: valid dd
- * @source: logical IRQ source (ofse from IS_RCVURGENT_START)
+ * @source: logical IRQ source (offset from IS_RCVURGENT_START)
*
* RX block receive urgent interrupt. Source is < 160.
*
@@ -8231,7 +8249,7 @@
is_sdma_eng_err_name, is_sdma_eng_err_int },
{ IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END,
is_sendctxt_err_name, is_sendctxt_err_int },
-{ IS_SDMA_START, IS_SDMA_END,
+{ IS_SDMA_START, IS_SDMA_IDLE_END,
is_sdma_eng_name, is_sdma_eng_int },
{ IS_VARIOUS_START, IS_VARIOUS_END,
is_various_name, is_various_int },
@@ -8257,7 +8275,7 @@
/* avoids a double compare by walking the table in-order */
for (entry = &is_table[0]; entry->is_name; entry++) {
- if (source < entry->end) {
+ if (source <= entry->end) {
trace_hfi1_interrupt(dd, entry, source);
entry->is_int(dd, source - entry->start);
return;
@@ -8276,7 +8294,7 @@
* context DATA IRQs are threaded and are not supported by this handler.
*
*/
-static irqreturn_t general_interrupt(int irq, void *data)
+irqreturn_t general_interrupt(int irq, void *data)
{
struct hfi1_devdata *dd = data;
u64 regs[CCE_NUM_INT_CSRS];
@@ -8309,7 +8327,7 @@
return handled;
}
-static irqreturn_t sdma_interrupt(int irq, void *data)
+irqreturn_t sdma_interrupt(int irq, void *data)
{
struct sdma_engine *sde = data;
struct hfi1_devdata *dd = sde->dd;
@@ -8352,7 +8370,6 @@
struct hfi1_devdata *dd = rcd->dd;
u32 addr = CCE_INT_CLEAR + (8 * rcd->ireg);
- mmiowb(); /* make sure everything before is written */
write_csr(dd, addr, rcd->imask);
/* force the above write on the chip and get a value back */
(void)read_csr(dd, addr);
@@ -8401,7 +8418,7 @@
* invoked) is finished. The intent is to avoid extra interrupts while we
* are processing packets anyway.
*/
-static irqreturn_t receive_context_interrupt(int irq, void *data)
+irqreturn_t receive_context_interrupt(int irq, void *data)
{
struct hfi1_ctxtdata *rcd = data;
struct hfi1_devdata *dd = rcd->dd;
@@ -8441,7 +8458,7 @@
* Receive packet thread handler. This expects to be invoked with the
* receive interrupt still blocked.
*/
-static irqreturn_t receive_context_thread(int irq, void *data)
+irqreturn_t receive_context_thread(int irq, void *data)
{
struct hfi1_ctxtdata *rcd = data;
int present;
@@ -9651,30 +9668,10 @@
}
}
-static void init_qsfp_int(struct hfi1_devdata *dd)
+void init_qsfp_int(struct hfi1_devdata *dd)
{
struct hfi1_pportdata *ppd = dd->pport;
- u64 qsfp_mask, cce_int_mask;
- const int qsfp1_int_smask = QSFP1_INT % 64;
- const int qsfp2_int_smask = QSFP2_INT % 64;
-
- /*
- * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
- * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
- * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
- * the index of the appropriate CSR in the CCEIntMask CSR array
- */
- cce_int_mask = read_csr(dd, CCE_INT_MASK +
- (8 * (QSFP1_INT / 64)));
- if (dd->hfi1_id) {
- cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
- write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)),
- cce_int_mask);
- } else {
- cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
- write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)),
- cce_int_mask);
- }
+ u64 qsfp_mask;
qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
/* Clear current status to avoid spurious interrupts */
@@ -9691,6 +9688,12 @@
write_csr(dd,
dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
qsfp_mask);
+
+ /* Enable the appropriate QSFP IRQ source */
+ if (!dd->hfi1_id)
+ set_intr_bits(dd, QSFP1_INT, QSFP1_INT, true);
+ else
+ set_intr_bits(dd, QSFP2_INT, QSFP2_INT, true);
}
/*
@@ -9849,6 +9852,7 @@
/* disable the port */
clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+ cancel_work_sync(&ppd->freeze_work);
}
static inline int init_cpu_counters(struct hfi1_devdata *dd)
@@ -10577,12 +10581,29 @@
}
}
-/*
- * Verify if BCT for data VLs is non-zero.
+/**
+ * data_vls_operational() - Verify if data VL BCT credits and MTU
+ * are both set.
+ * @ppd: pointer to hfi1_pportdata structure
+ *
+ * Return: true - Ok, false -otherwise.
*/
static inline bool data_vls_operational(struct hfi1_pportdata *ppd)
{
- return !!ppd->actual_vls_operational;
+ int i;
+ u64 reg;
+
+ if (!ppd->actual_vls_operational)
+ return false;
+
+ for (i = 0; i < ppd->vls_supported; i++) {
+ reg = read_csr(ppd->dd, SEND_CM_CREDIT_VL + (8 * i));
+ if ((reg && !ppd->dd->vld[i].mtu) ||
+ (!reg && ppd->dd->vld[i].mtu))
+ return false;
+ }
+
+ return true;
}
/*
@@ -10695,7 +10716,8 @@
if (!data_vls_operational(ppd)) {
dd_dev_err(dd,
- "%s: data VLs not operational\n", __func__);
+ "%s: Invalid data VL credits or mtu\n",
+ __func__);
ret = -EINVAL;
break;
}
@@ -10768,13 +10790,15 @@
break;
ppd->port_error_action = 0;
- ppd->host_link_state = HLS_DN_POLL;
if (quick_linkup) {
/* quick linkup does not go into polling */
ret = do_quick_linkup(dd);
} else {
ret1 = set_physical_link_state(dd, PLS_POLLING);
+ if (!ret1)
+ ret1 = wait_phys_link_out_of_offline(ppd,
+ 3000);
if (ret1 != HCMD_SUCCESS) {
dd_dev_err(dd,
"Failed to transition to Polling link state, return 0x%x\n",
@@ -10782,6 +10806,14 @@
ret = -EINVAL;
}
}
+
+ /*
+ * Change the host link state after requesting DC8051 to
+ * change its physical state so that we can ignore any
+ * interrupt with stale LNI(XX) error, which will not be
+ * cleared until DC8051 transitions to Polling state.
+ */
+ ppd->host_link_state = HLS_DN_POLL;
ppd->offline_disabled_reason =
HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
/*
@@ -11776,12 +11808,10 @@
<< RCV_EGR_INDEX_HEAD_HEAD_SHIFT;
write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg);
}
- mmiowb();
reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) |
(((u64)hd & RCV_HDR_HEAD_HEAD_MASK)
<< RCV_HDR_HEAD_HEAD_SHIFT);
write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
- mmiowb();
}
u32 hdrqempty(struct hfi1_ctxtdata *rcd)
@@ -11932,10 +11962,16 @@
rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
}
- if (op & HFI1_RCVCTRL_INTRAVAIL_ENB)
+ if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) {
+ set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt,
+ IS_RCVAVAIL_START + rcd->ctxt, true);
rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
- if (op & HFI1_RCVCTRL_INTRAVAIL_DIS)
+ }
+ if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) {
+ set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt,
+ IS_RCVAVAIL_START + rcd->ctxt, false);
rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+ }
if ((op & HFI1_RCVCTRL_TAILUPD_ENB) && rcd->rcvhdrtail_kvaddr)
rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
if (op & HFI1_RCVCTRL_TAILUPD_DIS) {
@@ -11965,6 +12001,13 @@
rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+ if (op & HFI1_RCVCTRL_URGENT_ENB)
+ set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt,
+ IS_RCVURGENT_START + rcd->ctxt, true);
+ if (op & HFI1_RCVCTRL_URGENT_DIS)
+ set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt,
+ IS_RCVURGENT_START + rcd->ctxt, false);
+
hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl);
write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcvctrl);
@@ -12913,6 +12956,39 @@
return read_state;
}
+/*
+ * wait_phys_link_out_of_offline - wait for any out of offline state
+ * @ppd: port device
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for any out of offline physical link
+ * state change to occur.
+ * Returns 0 if at least one state is reached, otherwise -ETIMEDOUT.
+ */
+static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd,
+ int msecs)
+{
+ u32 read_state;
+ unsigned long timeout;
+
+ timeout = jiffies + msecs_to_jiffies(msecs);
+ while (1) {
+ read_state = read_physical_state(ppd->dd);
+ if ((read_state & 0xF0) != PLS_OFFLINE)
+ break;
+ if (time_after(jiffies, timeout)) {
+ dd_dev_err(ppd->dd,
+ "timeout waiting for phy link out of offline. Read state 0x%x, %dms\n",
+ read_state, msecs);
+ return -ETIMEDOUT;
+ }
+ usleep_range(1950, 2050); /* sleep 2ms-ish */
+ }
+
+ log_state_transition(ppd, read_state);
+ return read_state;
+}
+
#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
@@ -12964,63 +13040,71 @@
return ret;
}
-/**
- * get_int_mask - get 64 bit int mask
- * @dd - the devdata
- * @i - the csr (relative to CCE_INT_MASK)
- *
- * Returns the mask with the urgent interrupt mask
- * bit clear for kernel receive contexts.
- */
-static u64 get_int_mask(struct hfi1_devdata *dd, u32 i)
-{
- u64 mask = U64_MAX; /* default to no change */
-
- if (i >= (IS_RCVURGENT_START / 64) && i < (IS_RCVURGENT_END / 64)) {
- int j = (i - (IS_RCVURGENT_START / 64)) * 64;
- int k = !j ? IS_RCVURGENT_START % 64 : 0;
-
- if (j)
- j -= IS_RCVURGENT_START % 64;
- /* j = 0..dd->first_dyn_alloc_ctxt - 1,k = 0..63 */
- for (; j < dd->first_dyn_alloc_ctxt && k < 64; j++, k++)
- /* convert to bit in mask and clear */
- mask &= ~BIT_ULL(k);
- }
- return mask;
-}
-
/* ========================================================================= */
-/*
- * Enable/disable chip from delivering interrupts.
+/**
+ * read_mod_write() - Calculate the IRQ register index and set/clear the bits
+ * @dd: valid devdata
+ * @src: IRQ source to determine register index from
+ * @bits: the bits to set or clear
+ * @set: true == set the bits, false == clear the bits
+ *
*/
-void set_intr_state(struct hfi1_devdata *dd, u32 enable)
+static void read_mod_write(struct hfi1_devdata *dd, u16 src, u64 bits,
+ bool set)
{
- int i;
+ u64 reg;
+ u16 idx = src / BITS_PER_REGISTER;
- /*
- * In HFI, the mask needs to be 1 to allow interrupts.
- */
- if (enable) {
- /* enable all interrupts but urgent on kernel contexts */
- for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
- u64 mask = get_int_mask(dd, i);
+ spin_lock(&dd->irq_src_lock);
+ reg = read_csr(dd, CCE_INT_MASK + (8 * idx));
+ if (set)
+ reg |= bits;
+ else
+ reg &= ~bits;
+ write_csr(dd, CCE_INT_MASK + (8 * idx), reg);
+ spin_unlock(&dd->irq_src_lock);
+}
- write_csr(dd, CCE_INT_MASK + (8 * i), mask);
+/**
+ * set_intr_bits() - Enable/disable a range (one or more) IRQ sources
+ * @dd: valid devdata
+ * @first: first IRQ source to set/clear
+ * @last: last IRQ source (inclusive) to set/clear
+ * @set: true == set the bits, false == clear the bits
+ *
+ * If first == last, set the exact source.
+ */
+int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set)
+{
+ u64 bits = 0;
+ u64 bit;
+ u16 src;
+
+ if (first > NUM_INTERRUPT_SOURCES || last > NUM_INTERRUPT_SOURCES)
+ return -EINVAL;
+
+ if (last < first)
+ return -ERANGE;
+
+ for (src = first; src <= last; src++) {
+ bit = src % BITS_PER_REGISTER;
+ /* wrapped to next register? */
+ if (!bit && bits) {
+ read_mod_write(dd, src - 1, bits, set);
+ bits = 0;
}
-
- init_qsfp_int(dd);
- } else {
- for (i = 0; i < CCE_NUM_INT_CSRS; i++)
- write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);
+ bits |= BIT_ULL(bit);
}
+ read_mod_write(dd, last, bits, set);
+
+ return 0;
}
/*
* Clear all interrupt sources on the chip.
*/
-static void clear_all_interrupts(struct hfi1_devdata *dd)
+void clear_all_interrupts(struct hfi1_devdata *dd)
{
int i;
@@ -13044,38 +13128,11 @@
write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0);
}
-/**
- * hfi1_clean_up_interrupts() - Free all IRQ resources
- * @dd: valid device data data structure
- *
- * Free the MSIx and assoicated PCI resources, if they have been allocated.
- */
-void hfi1_clean_up_interrupts(struct hfi1_devdata *dd)
-{
- int i;
- struct hfi1_msix_entry *me = dd->msix_entries;
-
- /* remove irqs - must happen before disabling/turning off */
- for (i = 0; i < dd->num_msix_entries; i++, me++) {
- if (!me->arg) /* => no irq, no affinity */
- continue;
- hfi1_put_irq_affinity(dd, me);
- pci_free_irq(dd->pcidev, i, me->arg);
- }
-
- /* clean structures */
- kfree(dd->msix_entries);
- dd->msix_entries = NULL;
- dd->num_msix_entries = 0;
-
- pci_free_irq_vectors(dd->pcidev);
-}
-
/*
* Remap the interrupt source from the general handler to the given MSI-X
* interrupt.
*/
-static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
+void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
{
u64 reg;
int m, n;
@@ -13099,8 +13156,7 @@
write_csr(dd, CCE_INT_MAP + (8 * m), reg);
}
-static void remap_sdma_interrupts(struct hfi1_devdata *dd,
- int engine, int msix_intr)
+void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr)
{
/*
* SDMA engine interrupt sources grouped by type, rather than
@@ -13109,204 +13165,16 @@
* SDMAProgress
* SDMAIdle
*/
- remap_intr(dd, IS_SDMA_START + 0 * TXE_NUM_SDMA_ENGINES + engine,
- msix_intr);
- remap_intr(dd, IS_SDMA_START + 1 * TXE_NUM_SDMA_ENGINES + engine,
- msix_intr);
- remap_intr(dd, IS_SDMA_START + 2 * TXE_NUM_SDMA_ENGINES + engine,
- msix_intr);
-}
-
-static int request_msix_irqs(struct hfi1_devdata *dd)
-{
- int first_general, last_general;
- int first_sdma, last_sdma;
- int first_rx, last_rx;
- int i, ret = 0;
-
- /* calculate the ranges we are going to use */
- first_general = 0;
- last_general = first_general + 1;
- first_sdma = last_general;
- last_sdma = first_sdma + dd->num_sdma;
- first_rx = last_sdma;
- last_rx = first_rx + dd->n_krcv_queues + dd->num_vnic_contexts;
-
- /* VNIC MSIx interrupts get mapped when VNIC contexts are created */
- dd->first_dyn_msix_idx = first_rx + dd->n_krcv_queues;
-
- /*
- * Sanity check - the code expects all SDMA chip source
- * interrupts to be in the same CSR, starting at bit 0. Verify
- * that this is true by checking the bit location of the start.
- */
- BUILD_BUG_ON(IS_SDMA_START % 64);
-
- for (i = 0; i < dd->num_msix_entries; i++) {
- struct hfi1_msix_entry *me = &dd->msix_entries[i];
- const char *err_info;
- irq_handler_t handler;
- irq_handler_t thread = NULL;
- void *arg = NULL;
- int idx;
- struct hfi1_ctxtdata *rcd = NULL;
- struct sdma_engine *sde = NULL;
- char name[MAX_NAME_SIZE];
-
- /* obtain the arguments to pci_request_irq */
- if (first_general <= i && i < last_general) {
- idx = i - first_general;
- handler = general_interrupt;
- arg = dd;
- snprintf(name, sizeof(name),
- DRIVER_NAME "_%d", dd->unit);
- err_info = "general";
- me->type = IRQ_GENERAL;
- } else if (first_sdma <= i && i < last_sdma) {
- idx = i - first_sdma;
- sde = &dd->per_sdma[idx];
- handler = sdma_interrupt;
- arg = sde;
- snprintf(name, sizeof(name),
- DRIVER_NAME "_%d sdma%d", dd->unit, idx);
- err_info = "sdma";
- remap_sdma_interrupts(dd, idx, i);
- me->type = IRQ_SDMA;
- } else if (first_rx <= i && i < last_rx) {
- idx = i - first_rx;
- rcd = hfi1_rcd_get_by_index_safe(dd, idx);
- if (rcd) {
- /*
- * Set the interrupt register and mask for this
- * context's interrupt.
- */
- rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
- rcd->imask = ((u64)1) <<
- ((IS_RCVAVAIL_START + idx) % 64);
- handler = receive_context_interrupt;
- thread = receive_context_thread;
- arg = rcd;
- snprintf(name, sizeof(name),
- DRIVER_NAME "_%d kctxt%d",
- dd->unit, idx);
- err_info = "receive context";
- remap_intr(dd, IS_RCVAVAIL_START + idx, i);
- me->type = IRQ_RCVCTXT;
- rcd->msix_intr = i;
- hfi1_rcd_put(rcd);
- }
- } else {
- /* not in our expected range - complain, then
- * ignore it
- */
- dd_dev_err(dd,
- "Unexpected extra MSI-X interrupt %d\n", i);
- continue;
- }
- /* no argument, no interrupt */
- if (!arg)
- continue;
- /* make sure the name is terminated */
- name[sizeof(name) - 1] = 0;
- me->irq = pci_irq_vector(dd->pcidev, i);
- ret = pci_request_irq(dd->pcidev, i, handler, thread, arg,
- name);
- if (ret) {
- dd_dev_err(dd,
- "unable to allocate %s interrupt, irq %d, index %d, err %d\n",
- err_info, me->irq, idx, ret);
- return ret;
- }
- /*
- * assign arg after pci_request_irq call, so it will be
- * cleaned up
- */
- me->arg = arg;
-
- ret = hfi1_get_irq_affinity(dd, me);
- if (ret)
- dd_dev_err(dd, "unable to pin IRQ %d\n", ret);
- }
-
- return ret;
-}
-
-void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd)
-{
- int i;
-
- for (i = 0; i < dd->vnic.num_ctxt; i++) {
- struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i];
- struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr];
-
- synchronize_irq(me->irq);
- }
-}
-
-void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd)
-{
- struct hfi1_devdata *dd = rcd->dd;
- struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr];
-
- if (!me->arg) /* => no irq, no affinity */
- return;
-
- hfi1_put_irq_affinity(dd, me);
- pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg);
-
- me->arg = NULL;
-}
-
-void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd)
-{
- struct hfi1_devdata *dd = rcd->dd;
- struct hfi1_msix_entry *me;
- int idx = rcd->ctxt;
- void *arg = rcd;
- int ret;
-
- rcd->msix_intr = dd->vnic.msix_idx++;
- me = &dd->msix_entries[rcd->msix_intr];
-
- /*
- * Set the interrupt register and mask for this
- * context's interrupt.
- */
- rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
- rcd->imask = ((u64)1) <<
- ((IS_RCVAVAIL_START + idx) % 64);
- me->type = IRQ_RCVCTXT;
- me->irq = pci_irq_vector(dd->pcidev, rcd->msix_intr);
- remap_intr(dd, IS_RCVAVAIL_START + idx, rcd->msix_intr);
-
- ret = pci_request_irq(dd->pcidev, rcd->msix_intr,
- receive_context_interrupt,
- receive_context_thread, arg,
- DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
- if (ret) {
- dd_dev_err(dd, "vnic irq request (irq %d, idx %d) fail %d\n",
- me->irq, idx, ret);
- return;
- }
- /*
- * assign arg after pci_request_irq call, so it will be
- * cleaned up
- */
- me->arg = arg;
-
- ret = hfi1_get_irq_affinity(dd, me);
- if (ret) {
- dd_dev_err(dd,
- "unable to pin IRQ %d\n", ret);
- pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg);
- }
+ remap_intr(dd, IS_SDMA_START + engine, msix_intr);
+ remap_intr(dd, IS_SDMA_PROGRESS_START + engine, msix_intr);
+ remap_intr(dd, IS_SDMA_IDLE_START + engine, msix_intr);
}
/*
* Set the general handler to accept all interrupts, remap all
* chip interrupts back to MSI-X 0.
*/
-static void reset_interrupts(struct hfi1_devdata *dd)
+void reset_interrupts(struct hfi1_devdata *dd)
{
int i;
@@ -13319,54 +13187,33 @@
write_csr(dd, CCE_INT_MAP + (8 * i), 0);
}
+/**
+ * set_up_interrupts() - Initialize the IRQ resources and state
+ * @dd: valid devdata
+ *
+ */
static int set_up_interrupts(struct hfi1_devdata *dd)
{
- u32 total;
- int ret, request;
-
- /*
- * Interrupt count:
- * 1 general, "slow path" interrupt (includes the SDMA engines
- * slow source, SDMACleanupDone)
- * N interrupts - one per used SDMA engine
- * M interrupt - one per kernel receive context
- * V interrupt - one for each VNIC context
- */
- total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts;
-
- /* ask for MSI-X interrupts */
- request = request_msix(dd, total);
- if (request < 0) {
- ret = request;
- goto fail;
- } else {
- dd->msix_entries = kcalloc(total, sizeof(*dd->msix_entries),
- GFP_KERNEL);
- if (!dd->msix_entries) {
- ret = -ENOMEM;
- goto fail;
- }
- /* using MSI-X */
- dd->num_msix_entries = total;
- dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
- }
+ int ret;
/* mask all interrupts */
- set_intr_state(dd, 0);
+ set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false);
+
/* clear all pending interrupts */
clear_all_interrupts(dd);
/* reset general handler mask, chip MSI-X mappings */
reset_interrupts(dd);
- ret = request_msix_irqs(dd);
+ /* ask for MSI-X interrupts */
+ ret = msix_initialize(dd);
if (ret)
- goto fail;
+ return ret;
- return 0;
+ ret = msix_request_irqs(dd);
+ if (ret)
+ msix_clean_up_interrupts(dd);
-fail:
- hfi1_clean_up_interrupts(dd);
return ret;
}
@@ -13388,7 +13235,7 @@
int total_contexts;
int ret;
unsigned ngroups;
- int qos_rmt_count;
+ int rmt_count;
int user_rmt_reduced;
u32 n_usr_ctxts;
u32 send_contexts = chip_send_contexts(dd);
@@ -13450,10 +13297,23 @@
n_usr_ctxts = rcv_contexts - total_contexts;
}
- /* each user context requires an entry in the RMT */
- qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
- if (qos_rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
- user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
+ /*
+ * The RMT entries are currently allocated as shown below:
+ * 1. QOS (0 to 128 entries);
+ * 2. FECN (num_kernel_context - 1 + num_user_contexts +
+ * num_vnic_contexts);
+ * 3. VNIC (num_vnic_contexts).
+ * It should be noted that FECN oversubscribe num_vnic_contexts
+ * entries of RMT because both VNIC and PSM could allocate any receive
+ * context between dd->first_dyn_alloc_text and dd->num_rcv_contexts,
+ * and PSM FECN must reserve an RMT entry for each possible PSM receive
+ * context.
+ */
+ rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_vnic_contexts * 2);
+ if (HFI1_CAP_IS_KSET(TID_RDMA))
+ rmt_count += num_kernel_contexts - 1;
+ if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
+ user_rmt_reduced = NUM_MAP_ENTRIES - rmt_count;
dd_dev_err(dd,
"RMT size is reducing the number of user receive contexts from %u to %d\n",
n_usr_ctxts,
@@ -14174,6 +14034,19 @@
}
/**
+ * hfi1_get_qp_map
+ * @dd: device data
+ * @idx: index to read
+ */
+u8 hfi1_get_qp_map(struct hfi1_devdata *dd, u8 idx)
+{
+ u64 reg = read_csr(dd, RCV_QP_MAP_TABLE + (idx / 8) * 8);
+
+ reg >>= (idx % 8) * 8;
+ return reg;
+}
+
+/**
* init_qpmap_table
* @dd - device data
* @first_ctxt - first context
@@ -14434,35 +14307,43 @@
init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
}
-static void init_user_fecn_handling(struct hfi1_devdata *dd,
- struct rsm_map_table *rmt)
+static void init_fecn_handling(struct hfi1_devdata *dd,
+ struct rsm_map_table *rmt)
{
struct rsm_rule_data rrd;
u64 reg;
- int i, idx, regoff, regidx;
+ int i, idx, regoff, regidx, start;
u8 offset;
+ u32 total_cnt;
+
+ if (HFI1_CAP_IS_KSET(TID_RDMA))
+ /* Exclude context 0 */
+ start = 1;
+ else
+ start = dd->first_dyn_alloc_ctxt;
+
+ total_cnt = dd->num_rcv_contexts - start;
/* there needs to be enough room in the map table */
- if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
- dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+ if (rmt->used + total_cnt >= NUM_MAP_ENTRIES) {
+ dd_dev_err(dd, "FECN handling disabled - too many contexts allocated\n");
return;
}
/*
* RSM will extract the destination context as an index into the
* map table. The destination contexts are a sequential block
- * in the range first_dyn_alloc_ctxt...num_rcv_contexts-1 (inclusive).
+ * in the range start...num_rcv_contexts-1 (inclusive).
* Map entries are accessed as offset + extracted value. Adjust
* the added offset so this sequence can be placed anywhere in
* the table - as long as the entries themselves do not wrap.
* There are only enough bits in offset for the table size, so
* start with that to allow for a "negative" offset.
*/
- offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
- (int)dd->first_dyn_alloc_ctxt);
+ offset = (u8)(NUM_MAP_ENTRIES + rmt->used - start);
- for (i = dd->first_dyn_alloc_ctxt, idx = rmt->used;
- i < dd->num_rcv_contexts; i++, idx++) {
+ for (i = start, idx = rmt->used; i < dd->num_rcv_contexts;
+ i++, idx++) {
/* replace with identity mapping */
regoff = (idx % 8) * 8;
regidx = idx / 8;
@@ -14497,7 +14378,7 @@
/* add rule 1 */
add_rsm_rule(dd, RSM_INS_FECN, &rrd);
- rmt->used += dd->num_user_contexts;
+ rmt->used += total_cnt;
}
/* Initialize RSM for VNIC */
@@ -14573,7 +14454,7 @@
clear_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
}
-static void init_rxe(struct hfi1_devdata *dd)
+static int init_rxe(struct hfi1_devdata *dd)
{
struct rsm_map_table *rmt;
u64 val;
@@ -14582,9 +14463,12 @@
write_csr(dd, RCV_ERR_MASK, ~0ull);
rmt = alloc_rsm_map_table(dd);
+ if (!rmt)
+ return -ENOMEM;
+
/* set up QOS, including the QPN map table */
init_qos(dd, rmt);
- init_user_fecn_handling(dd, rmt);
+ init_fecn_handling(dd, rmt);
complete_rsm_map_table(dd, rmt);
/* record number of used rsm map entries for vnic */
dd->vnic.rmt_start = rmt->used;
@@ -14608,6 +14492,7 @@
val |= ((4ull & RCV_BYPASS_HDR_SIZE_MASK) <<
RCV_BYPASS_HDR_SIZE_SHIFT);
write_csr(dd, RCV_BYPASS, val);
+ return 0;
}
static void init_other(struct hfi1_devdata *dd)
@@ -14810,8 +14695,8 @@
*/
static int init_asic_data(struct hfi1_devdata *dd)
{
- unsigned long flags;
- struct hfi1_devdata *tmp, *peer = NULL;
+ unsigned long index;
+ struct hfi1_devdata *peer;
struct hfi1_asic_data *asic_data;
int ret = 0;
@@ -14820,14 +14705,12 @@
if (!asic_data)
return -ENOMEM;
- spin_lock_irqsave(&hfi1_devs_lock, flags);
+ xa_lock_irq(&hfi1_dev_table);
/* Find our peer device */
- list_for_each_entry(tmp, &hfi1_dev_list, list) {
- if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) &&
- dd->unit != tmp->unit) {
- peer = tmp;
+ xa_for_each(&hfi1_dev_table, index, peer) {
+ if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(peer)) &&
+ dd->unit != peer->unit)
break;
- }
}
if (peer) {
@@ -14839,7 +14722,7 @@
mutex_init(&dd->asic_data->asic_resource_mutex);
}
dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */
- spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+ xa_unlock_irq(&hfi1_dev_table);
/* first one through - set up i2c devices */
if (!peer)
@@ -14919,20 +14802,16 @@
}
/**
- * Allocate and initialize the device structure for the hfi.
+ * hfi1_init_dd() - Initialize most of the dd structure.
* @dev: the pci_dev for hfi1_ib device
* @ent: pci_device_id struct for this dev
*
- * Also allocates, initializes, and returns the devdata struct for this
- * device instance
- *
* This is global, and is called directly at init to set up the
* chip-specific function pointers for later use.
*/
-struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
- const struct pci_device_id *ent)
+int hfi1_init_dd(struct hfi1_devdata *dd)
{
- struct hfi1_devdata *dd;
+ struct pci_dev *pdev = dd->pcidev;
struct hfi1_pportdata *ppd;
u64 reg;
int i, ret;
@@ -14943,13 +14822,8 @@
"Functional simulator"
};
struct pci_dev *parent = pdev->bus->self;
- u32 sdma_engines;
+ u32 sdma_engines = chip_sdma_engines(dd);
- dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
- sizeof(struct hfi1_pportdata));
- if (IS_ERR(dd))
- goto bail;
- sdma_engines = chip_sdma_engines(dd);
ppd = dd->pport;
for (i = 0; i < dd->num_pports; i++, ppd++) {
int vl;
@@ -15128,6 +15002,12 @@
if (ret)
goto bail_cleanup;
+ /*
+ * This should probably occur in hfi1_pcie_init(), but historically
+ * occurs after the do_pcie_gen3_transition() code.
+ */
+ tune_pcie_caps(dd);
+
/* start setting dd values and adjusting CSRs */
init_early_variables(dd);
@@ -15150,7 +15030,10 @@
goto bail_cleanup;
/* set initial RXE CSRs */
- init_rxe(dd);
+ ret = init_rxe(dd);
+ if (ret)
+ goto bail_cleanup;
+
/* set initial TXE CSRs */
init_txe(dd);
/* set initial non-RXE, non-TXE CSRs */
@@ -15240,14 +15123,13 @@
free_cntrs(dd);
bail_clear_intr:
hfi1_comp_vectors_clean_up(dd);
- hfi1_clean_up_interrupts(dd);
+ msix_clean_up_interrupts(dd);
bail_cleanup:
hfi1_pcie_ddcleanup(dd);
bail_free:
hfi1_free_devdata(dd);
- dd = ERR_PTR(ret);
bail:
- return dd;
+ return ret;
}
static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate,
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
index 36b04d6..4ca5ac8 100644
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -1,7 +1,7 @@
#ifndef _CHIP_H
#define _CHIP_H
/*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -52,9 +52,7 @@
*/
/* sizes */
-#define CCE_NUM_MSIX_VECTORS 256
-#define CCE_NUM_INT_CSRS 12
-#define CCE_NUM_INT_MAP_CSRS 96
+#define BITS_PER_REGISTER (BITS_PER_BYTE * sizeof(u64))
#define NUM_INTERRUPT_SOURCES 768
#define RXE_NUM_CONTEXTS 160
#define RXE_PER_CONTEXT_SIZE 0x1000 /* 4k */
@@ -161,34 +159,49 @@
(CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \
CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT)
-/* interrupt source numbers */
-#define IS_GENERAL_ERR_START 0
-#define IS_SDMAENG_ERR_START 16
-#define IS_SENDCTXT_ERR_START 32
-#define IS_SDMA_START 192 /* includes SDmaProgress,SDmaIdle */
+/* Specific IRQ sources */
+#define CCE_ERR_INT 0
+#define RXE_ERR_INT 1
+#define MISC_ERR_INT 2
+#define PIO_ERR_INT 4
+#define SDMA_ERR_INT 5
+#define EGRESS_ERR_INT 6
+#define TXE_ERR_INT 7
+#define PBC_INT 240
+#define GPIO_ASSERT_INT 241
+#define QSFP1_INT 242
+#define QSFP2_INT 243
+#define TCRIT_INT 244
+
+/* interrupt source ranges */
+#define IS_FIRST_SOURCE CCE_ERR_INT
+#define IS_GENERAL_ERR_START 0
+#define IS_SDMAENG_ERR_START 16
+#define IS_SENDCTXT_ERR_START 32
+#define IS_SDMA_START 192
+#define IS_SDMA_PROGRESS_START 208
+#define IS_SDMA_IDLE_START 224
#define IS_VARIOUS_START 240
#define IS_DC_START 248
#define IS_RCVAVAIL_START 256
#define IS_RCVURGENT_START 416
#define IS_SENDCREDIT_START 576
#define IS_RESERVED_START 736
-#define IS_MAX_SOURCES 768
+#define IS_LAST_SOURCE 767
/* derived interrupt source values */
-#define IS_GENERAL_ERR_END IS_SDMAENG_ERR_START
-#define IS_SDMAENG_ERR_END IS_SENDCTXT_ERR_START
-#define IS_SENDCTXT_ERR_END IS_SDMA_START
-#define IS_SDMA_END IS_VARIOUS_START
-#define IS_VARIOUS_END IS_DC_START
-#define IS_DC_END IS_RCVAVAIL_START
-#define IS_RCVAVAIL_END IS_RCVURGENT_START
-#define IS_RCVURGENT_END IS_SENDCREDIT_START
-#define IS_SENDCREDIT_END IS_RESERVED_START
-#define IS_RESERVED_END IS_MAX_SOURCES
-
-/* absolute interrupt numbers for QSFP1Int and QSFP2Int */
-#define QSFP1_INT 242
-#define QSFP2_INT 243
+#define IS_GENERAL_ERR_END 7
+#define IS_SDMAENG_ERR_END 31
+#define IS_SENDCTXT_ERR_END 191
+#define IS_SDMA_END 207
+#define IS_SDMA_PROGRESS_END 223
+#define IS_SDMA_IDLE_END 239
+#define IS_VARIOUS_END 244
+#define IS_DC_END 255
+#define IS_RCVAVAIL_END 415
+#define IS_RCVURGENT_END 575
+#define IS_SENDCREDIT_END 735
+#define IS_RESERVED_END IS_LAST_SOURCE
/* DCC_CFG_PORT_CONFIG logical link states */
#define LSTATE_DOWN 0x1
@@ -791,6 +804,7 @@
u32 hdrqempty(struct hfi1_ctxtdata *rcd);
int is_ax(struct hfi1_devdata *dd);
int is_bx(struct hfi1_devdata *dd);
+bool is_urg_masked(struct hfi1_ctxtdata *rcd);
u32 read_physical_state(struct hfi1_devdata *dd);
u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate);
const char *opa_lstate_name(u32 lstate);
@@ -844,6 +858,9 @@
/* Per device counter indexes */
enum {
C_RCV_OVF = 0,
+ C_RX_LEN_ERR,
+ C_RX_ICRC_ERR,
+ C_RX_EBP,
C_RX_TID_FULL,
C_RX_TID_INVALID,
C_RX_TID_FLGMS,
@@ -913,6 +930,7 @@
C_SW_PIO_WAIT,
C_SW_PIO_DRAIN,
C_SW_KMEM_WAIT,
+ C_SW_TID_WAIT,
C_SW_SEND_SCHED,
C_SDMA_DESC_FETCHED_CNT,
C_SDMA_INT_CNT,
@@ -1227,6 +1245,7 @@
C_SW_IBP_RDMA_SEQ,
C_SW_IBP_UNALIGNED,
C_SW_IBP_SEQ_NAK,
+ C_SW_IBP_RC_CRWAITS,
C_SW_CPU_RC_ACKS,
C_SW_CPU_RC_QACKS,
C_SW_CPU_RC_DELAYED_COMP,
@@ -1416,6 +1435,19 @@
void hfi1_init_vnic_rsm(struct hfi1_devdata *dd);
void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd);
+irqreturn_t general_interrupt(int irq, void *data);
+irqreturn_t sdma_interrupt(int irq, void *data);
+irqreturn_t receive_context_interrupt(int irq, void *data);
+irqreturn_t receive_context_thread(int irq, void *data);
+
+int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set);
+void init_qsfp_int(struct hfi1_devdata *dd);
+void clear_all_interrupts(struct hfi1_devdata *dd);
+void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr);
+void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr);
+void reset_interrupts(struct hfi1_devdata *dd);
+u8 hfi1_get_qp_map(struct hfi1_devdata *dd, u8 idx);
+
/*
* Interrupt source table.
*
diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h
index ee6dca5..ab3589d 100644
--- a/drivers/infiniband/hw/hfi1/chip_registers.h
+++ b/drivers/infiniband/hw/hfi1/chip_registers.h
@@ -380,6 +380,9 @@
#define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418)
#define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468)
#define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0)
+#define RCV_LENGTH_ERR_CNT 0
+#define RCV_ICRC_ERR_CNT 6
+#define RCV_EBP_CNT 9
#define RCV_BUF_OVFL_CNT 10
#define RCV_CONTEXT_EGR_STALL 22
#define RCV_DATA_PKT_CNT 0
@@ -878,6 +881,10 @@
#define SEND_CTRL (TXE + 0x000000000000)
#define SEND_CTRL_CM_RESET_SMASK 0x4ull
#define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull
+#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
+#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xFFull
+#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
+ << SEND_CTRL_UNSUPPORTED_VL_SHIFT)
#define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull
#define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080)
#define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull
@@ -931,6 +938,10 @@
#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull
#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0
#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull
+#define SEND_CTXT_CREDIT_STATUS (TXE + 0x000000100018)
+#define SEND_CTXT_CREDIT_STATUS_CURRENT_FREE_COUNTER_MASK 0x7FFull
+#define SEND_CTXT_CREDIT_STATUS_CURRENT_FREE_COUNTER_SHIFT 32
+#define SEND_CTXT_CREDIT_STATUS_LAST_RETURNED_COUNTER_SMASK 0x7FFull
#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028)
#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull
#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020)
diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h
index 7108d4d..d47da7b 100644
--- a/drivers/infiniband/hw/hfi1/common.h
+++ b/drivers/infiniband/hw/hfi1/common.h
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2015, 2016 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -136,18 +136,21 @@
HFI1_CAP_ALLOW_PERM_JKEY | \
HFI1_CAP_STATIC_RATE_CTRL | \
HFI1_CAP_PRINT_UNIMPL | \
- HFI1_CAP_TID_UNMAP)
+ HFI1_CAP_TID_UNMAP | \
+ HFI1_CAP_OPFN)
/*
* A set of capability bits that are "global" and are not allowed to be
* set in the user bitmask.
*/
#define HFI1_CAP_RESERVED_MASK ((HFI1_CAP_SDMA | \
- HFI1_CAP_USE_SDMA_HEAD | \
- HFI1_CAP_EXTENDED_PSN | \
- HFI1_CAP_PRINT_UNIMPL | \
- HFI1_CAP_NO_INTEGRITY | \
- HFI1_CAP_PKEY_CHECK) << \
- HFI1_CAP_USER_SHIFT)
+ HFI1_CAP_USE_SDMA_HEAD | \
+ HFI1_CAP_EXTENDED_PSN | \
+ HFI1_CAP_PRINT_UNIMPL | \
+ HFI1_CAP_NO_INTEGRITY | \
+ HFI1_CAP_PKEY_CHECK | \
+ HFI1_CAP_TID_RDMA | \
+ HFI1_CAP_OPFN) << \
+ HFI1_CAP_USER_SHIFT)
/*
* Set of capabilities that need to be enabled for kernel context in
* order to be allowed for user contexts, as well.
@@ -283,7 +286,7 @@
#define RHF_TID_ERR (0x1ull << 59)
#define RHF_LEN_ERR (0x1ull << 60)
#define RHF_ECC_ERR (0x1ull << 61)
-#define RHF_VCRC_ERR (0x1ull << 62)
+#define RHF_RESERVED (0x1ull << 62)
#define RHF_ICRC_ERR (0x1ull << 63)
#define RHF_ERROR_SMASK 0xffe0000000000000ull /* bits 63:53 */
@@ -337,6 +340,10 @@
#define HFI1_PSM_IOC_BASE_SEQ 0x0
+/* Number of BTH.PSN bits used for sequence number in expected rcvs */
+#define HFI1_KDETH_BTH_SEQ_SHIFT 11
+#define HFI1_KDETH_BTH_SEQ_MASK (BIT(HFI1_KDETH_BTH_SEQ_SHIFT) - 1)
+
static inline __u64 rhf_to_cpu(const __le32 *rbuf)
{
return __le64_to_cpu(*((__le64 *)rbuf));
diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c
index 9f992ae..d268bf9 100644
--- a/drivers/infiniband/hw/hfi1/debugfs.c
+++ b/drivers/infiniband/hw/hfi1/debugfs.c
@@ -407,6 +407,54 @@
DEBUGFS_SEQ_FILE_OPEN(rcds)
DEBUGFS_FILE_OPS(rcds);
+static void *_pios_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct hfi1_ibdev *ibd;
+ struct hfi1_devdata *dd;
+
+ ibd = (struct hfi1_ibdev *)s->private;
+ dd = dd_from_dev(ibd);
+ if (!dd->send_contexts || *pos >= dd->num_send_contexts)
+ return NULL;
+ return pos;
+}
+
+static void *_pios_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+
+ ++*pos;
+ if (!dd->send_contexts || *pos >= dd->num_send_contexts)
+ return NULL;
+ return pos;
+}
+
+static void _pios_seq_stop(struct seq_file *s, void *v)
+{
+}
+
+static int _pios_seq_show(struct seq_file *s, void *v)
+{
+ struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private;
+ struct hfi1_devdata *dd = dd_from_dev(ibd);
+ struct send_context_info *sci;
+ loff_t *spos = v;
+ loff_t i = *spos;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->sc_lock, flags);
+ sci = &dd->send_contexts[i];
+ if (sci && sci->type != SC_USER && sci->allocated && sci->sc)
+ seqfile_dump_sci(s, i, sci);
+ spin_unlock_irqrestore(&dd->sc_lock, flags);
+ return 0;
+}
+
+DEBUGFS_SEQ_FILE_OPS(pios);
+DEBUGFS_SEQ_FILE_OPEN(pios)
+DEBUGFS_FILE_OPS(pios);
+
/* read the per-device counters */
static ssize_t dev_counters_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
@@ -939,9 +987,6 @@
struct hfi1_pportdata *ppd;
int ret;
- if (!try_module_get(THIS_MODULE))
- return -ENODEV;
-
ppd = private2ppd(fp);
ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0);
@@ -1032,10 +1077,82 @@
return __qsfp_debugfs_release(in, fp, 1);
}
+#define EXPROM_WRITE_ENABLE BIT_ULL(14)
+
+static bool exprom_wp_disabled;
+
+static int exprom_wp_set(struct hfi1_devdata *dd, bool disable)
+{
+ u64 gpio_val = 0;
+
+ if (disable) {
+ gpio_val = EXPROM_WRITE_ENABLE;
+ exprom_wp_disabled = true;
+ dd_dev_info(dd, "Disable Expansion ROM Write Protection\n");
+ } else {
+ exprom_wp_disabled = false;
+ dd_dev_info(dd, "Enable Expansion ROM Write Protection\n");
+ }
+
+ write_csr(dd, ASIC_GPIO_OUT, gpio_val);
+ write_csr(dd, ASIC_GPIO_OE, gpio_val);
+
+ return 0;
+}
+
+static ssize_t exprom_wp_debugfs_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return 0;
+}
+
+static ssize_t exprom_wp_debugfs_write(struct file *file,
+ const char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct hfi1_pportdata *ppd = private2ppd(file);
+ char cdata;
+
+ if (count != 1)
+ return -EINVAL;
+ if (get_user(cdata, buf))
+ return -EFAULT;
+ if (cdata == '0')
+ exprom_wp_set(ppd->dd, false);
+ else if (cdata == '1')
+ exprom_wp_set(ppd->dd, true);
+ else
+ return -EINVAL;
+
+ return 1;
+}
+
+static unsigned long exprom_in_use;
+
+static int exprom_wp_debugfs_open(struct inode *in, struct file *fp)
+{
+ if (test_and_set_bit(0, &exprom_in_use))
+ return -EBUSY;
+
+ return 0;
+}
+
+static int exprom_wp_debugfs_release(struct inode *in, struct file *fp)
+{
+ struct hfi1_pportdata *ppd = private2ppd(fp);
+
+ if (exprom_wp_disabled)
+ exprom_wp_set(ppd->dd, false);
+ clear_bit(0, &exprom_in_use);
+
+ return 0;
+}
+
#define DEBUGFS_OPS(nm, readroutine, writeroutine) \
{ \
.name = nm, \
.ops = { \
+ .owner = THIS_MODULE, \
.read = readroutine, \
.write = writeroutine, \
.llseek = generic_file_llseek, \
@@ -1046,6 +1163,7 @@
{ \
.name = nm, \
.ops = { \
+ .owner = THIS_MODULE, \
.read = readf, \
.write = writef, \
.llseek = generic_file_llseek, \
@@ -1071,6 +1189,9 @@
qsfp1_debugfs_open, qsfp1_debugfs_release),
DEBUGFS_XOPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write,
qsfp2_debugfs_open, qsfp2_debugfs_release),
+ DEBUGFS_XOPS("exprom_wp", exprom_wp_debugfs_read,
+ exprom_wp_debugfs_write, exprom_wp_debugfs_open,
+ exprom_wp_debugfs_release),
DEBUGFS_OPS("asic_flags", asic_flags_read, asic_flags_write),
DEBUGFS_OPS("dc8051_memory", dc8051_memory_read, NULL),
DEBUGFS_OPS("lcb", debugfs_lcb_read, debugfs_lcb_write),
@@ -1119,6 +1240,7 @@
char link[10];
struct hfi1_devdata *dd = dd_from_dev(ibd);
struct hfi1_pportdata *ppd;
+ struct dentry *root;
int unit = dd->unit;
int i, j;
@@ -1126,30 +1248,29 @@
return;
snprintf(name, sizeof(name), "%s_%d", class_name(), unit);
snprintf(link, sizeof(link), "%d", unit);
- ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root);
- if (!ibd->hfi1_ibdev_dbg) {
- pr_warn("create of %s failed\n", name);
- return;
- }
+ root = debugfs_create_dir(name, hfi1_dbg_root);
+ ibd->hfi1_ibdev_dbg = root;
+
ibd->hfi1_ibdev_link =
debugfs_create_symlink(link, hfi1_dbg_root, name);
- if (!ibd->hfi1_ibdev_link) {
- pr_warn("create of %s symlink failed\n", name);
- return;
- }
- DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(tx_opcode_stats, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(rcds, ibd->hfi1_ibdev_dbg, ibd);
- DEBUGFS_SEQ_FILE_CREATE(sdma_cpu_list, ibd->hfi1_ibdev_dbg, ibd);
+
+ debugfs_create_file("opcode_stats", 0444, root, ibd,
+ &_opcode_stats_file_ops);
+ debugfs_create_file("tx_opcode_stats", 0444, root, ibd,
+ &_tx_opcode_stats_file_ops);
+ debugfs_create_file("ctx_stats", 0444, root, ibd, &_ctx_stats_file_ops);
+ debugfs_create_file("qp_stats", 0444, root, ibd, &_qp_stats_file_ops);
+ debugfs_create_file("sdes", 0444, root, ibd, &_sdes_file_ops);
+ debugfs_create_file("rcds", 0444, root, ibd, &_rcds_file_ops);
+ debugfs_create_file("pios", 0444, root, ibd, &_pios_file_ops);
+ debugfs_create_file("sdma_cpu_list", 0444, root, ibd,
+ &_sdma_cpu_list_file_ops);
+
/* dev counter files */
for (i = 0; i < ARRAY_SIZE(cntr_ops); i++)
- DEBUGFS_FILE_CREATE(cntr_ops[i].name,
- ibd->hfi1_ibdev_dbg,
- dd,
- &cntr_ops[i].ops, S_IRUGO);
+ debugfs_create_file(cntr_ops[i].name, 0444, root, dd,
+ &cntr_ops[i].ops);
+
/* per port files */
for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++)
for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) {
@@ -1157,12 +1278,11 @@
sizeof(name),
port_cntr_ops[i].name,
j + 1);
- DEBUGFS_FILE_CREATE(name,
- ibd->hfi1_ibdev_dbg,
- ppd,
- &port_cntr_ops[i].ops,
+ debugfs_create_file(name,
!port_cntr_ops[i].ops.write ?
- S_IRUGO : S_IRUGO | S_IWUSR);
+ S_IRUGO :
+ S_IRUGO | S_IWUSR,
+ root, ppd, &port_cntr_ops[i].ops);
}
hfi1_fault_init_debugfs(ibd);
@@ -1255,15 +1375,15 @@
static u64 hfi1_sps_ints(void)
{
- unsigned long flags;
+ unsigned long index, flags;
struct hfi1_devdata *dd;
u64 sps_ints = 0;
- spin_lock_irqsave(&hfi1_devs_lock, flags);
- list_for_each_entry(dd, &hfi1_dev_list, list) {
+ xa_lock_irqsave(&hfi1_dev_table, flags);
+ xa_for_each(&hfi1_dev_table, index, dd) {
sps_ints += get_all_cpu_total(dd->int_counter);
}
- spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+ xa_unlock_irqrestore(&hfi1_dev_table, flags);
return sps_ints;
}
@@ -1292,10 +1412,10 @@
void hfi1_dbg_init(void)
{
hfi1_dbg_root = debugfs_create_dir(DRIVER_NAME, NULL);
- if (!hfi1_dbg_root)
- pr_warn("init of debugfs failed\n");
- DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL);
- DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL);
+ debugfs_create_file("driver_stats_names", 0444, hfi1_dbg_root, NULL,
+ &_driver_stats_names_file_ops);
+ debugfs_create_file("driver_stats", 0444, hfi1_dbg_root, NULL,
+ &_driver_stats_file_ops);
}
void hfi1_dbg_exit(void)
diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h
index d5d8244..57e582c 100644
--- a/drivers/infiniband/hw/hfi1/debugfs.h
+++ b/drivers/infiniband/hw/hfi1/debugfs.h
@@ -49,16 +49,6 @@
struct hfi1_ibdev;
-#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \
-do { \
- struct dentry *ent; \
- const char *__name = name; \
- ent = debugfs_create_file(__name, mode, parent, \
- data, ops); \
- if (!ent) \
- pr_warn("create of %s failed\n", __name); \
-} while (0)
-
#define DEBUGFS_SEQ_FILE_OPS(name) \
static const struct seq_operations _##name##_seq_ops = { \
.start = _##name##_seq_start, \
@@ -89,8 +79,6 @@
.release = seq_release \
}
-#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \
- DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, 0444)
ssize_t hfi1_seq_read(struct file *file, char __user *buf, size_t size,
loff_t *ppos);
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
index a41f855..01aa1f1 100644
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -72,8 +72,6 @@
*/
const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n";
-DEFINE_SPINLOCK(hfi1_devs_lock);
-LIST_HEAD(hfi1_dev_list);
DEFINE_MUTEX(hfi1_mutex); /* general driver use */
unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
@@ -175,11 +173,11 @@
{
struct hfi1_devdata *dd;
struct hfi1_pportdata *ppd;
- unsigned long flags;
+ unsigned long index, flags;
int pidx, nunits_active = 0;
- spin_lock_irqsave(&hfi1_devs_lock, flags);
- list_for_each_entry(dd, &hfi1_dev_list, list) {
+ xa_lock_irqsave(&hfi1_dev_table, flags);
+ xa_for_each(&hfi1_dev_table, index, dd) {
if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase1)
continue;
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
@@ -190,7 +188,7 @@
}
}
}
- spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+ xa_unlock_irqrestore(&hfi1_dev_table, flags);
return nunits_active;
}
@@ -264,7 +262,7 @@
hfi1_dbg_fault_suppress_err(verbs_dev))
return;
- if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
+ if (packet->rhf & RHF_ICRC_ERR)
return;
if (packet->etype == RHF_RCV_TYPE_BYPASS) {
@@ -430,40 +428,60 @@
[HFI1_PKT_TYPE_16B] = &return_cnp_16B
};
-void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
- bool do_cnp)
+/**
+ * hfi1_process_ecn_slowpath - Process FECN or BECN bits
+ * @qp: The packet's destination QP
+ * @pkt: The packet itself.
+ * @prescan: Is the caller the RXQ prescan
+ *
+ * Process the packet's FECN or BECN bits. By now, the packet
+ * has already been evaluated whether processing of those bit should
+ * be done.
+ * The significance of the @prescan argument is that if the caller
+ * is the RXQ prescan, a CNP will be send out instead of waiting for the
+ * normal packet processing to send an ACK with BECN set (or a CNP).
+ */
+bool hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
+ bool prescan)
{
struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
struct ib_other_headers *ohdr = pkt->ohdr;
struct ib_grh *grh = pkt->grh;
- u32 rqpn = 0, bth1;
+ u32 rqpn = 0;
u16 pkey;
u32 rlid, slid, dlid = 0;
- u8 hdr_type, sc, svc_type;
- bool is_mcast = false;
+ u8 hdr_type, sc, svc_type, opcode;
+ bool is_mcast = false, ignore_fecn = false, do_cnp = false,
+ fecn, becn;
/* can be called from prescan */
if (pkt->etype == RHF_RCV_TYPE_BYPASS) {
- is_mcast = hfi1_is_16B_mcast(dlid);
pkey = hfi1_16B_get_pkey(pkt->hdr);
sc = hfi1_16B_get_sc(pkt->hdr);
dlid = hfi1_16B_get_dlid(pkt->hdr);
slid = hfi1_16B_get_slid(pkt->hdr);
+ is_mcast = hfi1_is_16B_mcast(dlid);
+ opcode = ib_bth_get_opcode(ohdr);
hdr_type = HFI1_PKT_TYPE_16B;
+ fecn = hfi1_16B_get_fecn(pkt->hdr);
+ becn = hfi1_16B_get_becn(pkt->hdr);
} else {
- is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
- (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
pkey = ib_bth_get_pkey(ohdr);
sc = hfi1_9B_get_sc5(pkt->hdr, pkt->rhf);
- dlid = ib_get_dlid(pkt->hdr);
+ dlid = qp->ibqp.qp_type != IB_QPT_UD ? ib_get_dlid(pkt->hdr) :
+ ppd->lid;
slid = ib_get_slid(pkt->hdr);
+ is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) &&
+ (dlid != be16_to_cpu(IB_LID_PERMISSIVE));
+ opcode = ib_bth_get_opcode(ohdr);
hdr_type = HFI1_PKT_TYPE_9B;
+ fecn = ib_bth_get_fecn(ohdr);
+ becn = ib_bth_get_becn(ohdr);
}
switch (qp->ibqp.qp_type) {
case IB_QPT_UD:
- dlid = ppd->lid;
rlid = slid;
rqpn = ib_get_sqpn(pkt->ohdr);
svc_type = IB_CC_SVCTYPE_UD;
@@ -485,22 +503,33 @@
svc_type = IB_CC_SVCTYPE_RC;
break;
default:
- return;
+ return false;
}
- bth1 = be32_to_cpu(ohdr->bth[1]);
+ ignore_fecn = is_mcast || (opcode == IB_OPCODE_CNP) ||
+ (opcode == IB_OPCODE_RC_ACKNOWLEDGE);
+ /*
+ * ACKNOWLEDGE packets do not get a CNP but this will be
+ * guarded by ignore_fecn above.
+ */
+ do_cnp = prescan ||
+ (opcode >= IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST &&
+ opcode <= IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE) ||
+ opcode == TID_OP(READ_RESP) ||
+ opcode == TID_OP(ACK);
+
/* Call appropriate CNP handler */
- if (do_cnp && (bth1 & IB_FECN_SMASK))
+ if (!ignore_fecn && do_cnp && fecn)
hfi1_handle_cnp_tbl[hdr_type](ibp, qp, rqpn, pkey,
dlid, rlid, sc, grh);
- if (!is_mcast && (bth1 & IB_BECN_SMASK)) {
- u32 lqpn = bth1 & RVT_QPN_MASK;
+ if (becn) {
+ u32 lqpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK;
u8 sl = ibp->sc_to_sl[sc];
process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type);
}
-
+ return !ignore_fecn && fecn;
}
struct ps_mdata {
@@ -599,7 +628,6 @@
struct rvt_dev_info *rdi = &rcd->dd->verbs_dev.rdi;
u64 rhf = rhf_to_cpu(rhf_addr);
u32 etype = rhf_rcv_type(rhf), qpn, bth1;
- int is_ecn = 0;
u8 lnh;
if (ps_done(&mdata, rhf, rcd))
@@ -625,12 +653,10 @@
goto next; /* just in case */
}
- bth1 = be32_to_cpu(packet->ohdr->bth[1]);
- is_ecn = !!(bth1 & (IB_FECN_SMASK | IB_BECN_SMASK));
-
- if (!is_ecn)
+ if (!hfi1_may_ecn(packet))
goto next;
+ bth1 = be32_to_cpu(packet->ohdr->bth[1]);
qpn = bth1 & RVT_QPN_MASK;
rcu_read_lock();
qp = rvt_lookup_qpn(rdi, &ibp->rvp, qpn);
@@ -640,7 +666,7 @@
goto next;
}
- process_ecn(qp, packet, true);
+ hfi1_process_ecn_slowpath(qp, packet, true);
rcu_read_unlock();
/* turn off BECN, FECN */
@@ -1400,7 +1426,7 @@
if ((!(hfi1_is_16B_mcast(packet->dlid))) &&
(packet->dlid !=
opa_get_lid(be32_to_cpu(OPA_LID_PERMISSIVE), 16B))) {
- if (packet->dlid != ppd->lid)
+ if ((packet->dlid & ~((1 << ppd->lmc) - 1)) != ppd->lid)
return -EINVAL;
}
@@ -1549,25 +1575,31 @@
return -EINVAL;
}
-void handle_eflags(struct hfi1_packet *packet)
+static void show_eflags_errs(struct hfi1_packet *packet)
{
struct hfi1_ctxtdata *rcd = packet->rcd;
u32 rte = rhf_rcv_type_err(packet->rhf);
+ dd_dev_err(rcd->dd,
+ "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s] rte 0x%x\n",
+ rcd->ctxt, packet->rhf,
+ packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
+ packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
+ packet->rhf & RHF_DC_ERR ? "dc " : "",
+ packet->rhf & RHF_TID_ERR ? "tid " : "",
+ packet->rhf & RHF_LEN_ERR ? "len " : "",
+ packet->rhf & RHF_ECC_ERR ? "ecc " : "",
+ packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
+ rte);
+}
+
+void handle_eflags(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+
rcv_hdrerr(rcd, rcd->ppd, packet);
if (rhf_err_flags(packet->rhf))
- dd_dev_err(rcd->dd,
- "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
- rcd->ctxt, packet->rhf,
- packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
- packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
- packet->rhf & RHF_DC_ERR ? "dc " : "",
- packet->rhf & RHF_TID_ERR ? "tid " : "",
- packet->rhf & RHF_LEN_ERR ? "len " : "",
- packet->rhf & RHF_ECC_ERR ? "ecc " : "",
- packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
- packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
- rte);
+ show_eflags_errs(packet);
}
/*
@@ -1673,11 +1705,14 @@
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
- if (unlikely(rhf_err_flags(packet->rhf)))
- handle_eflags(packet);
+ if (unlikely(rhf_err_flags(packet->rhf))) {
+ struct hfi1_ctxtdata *rcd = packet->rcd;
- dd_dev_err(packet->rcd->dd,
- "Unhandled expected packet received. Dropping.\n");
+ if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
+ return RHF_RCV_CONTINUE;
+ }
+
+ hfi1_kdeth_expected_rcv(packet);
return RHF_RCV_CONTINUE;
}
@@ -1686,11 +1721,17 @@
hfi1_setup_9B_packet(packet);
if (unlikely(hfi1_dbg_should_fault_rx(packet)))
return RHF_RCV_CONTINUE;
- if (unlikely(rhf_err_flags(packet->rhf)))
- handle_eflags(packet);
- dd_dev_err(packet->rcd->dd,
- "Unhandled eager packet received. Dropping.\n");
+ trace_hfi1_rcvhdr(packet);
+ if (unlikely(rhf_err_flags(packet->rhf))) {
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+
+ show_eflags_errs(packet);
+ if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
+ return RHF_RCV_CONTINUE;
+ }
+
+ hfi1_kdeth_eager_rcv(packet);
return RHF_RCV_CONTINUE;
}
diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.c b/drivers/infiniband/hw/hfi1/exp_rcv.c
index 1be49a0..e9d5cc8 100644
--- a/drivers/infiniband/hw/hfi1/exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/exp_rcv.c
@@ -112,9 +112,6 @@
*/
void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd)
{
- WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_full_list));
- WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_used_list));
-
kfree(rcd->groups);
rcd->groups = NULL;
hfi1_exp_tid_group_init(rcd);
diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c
index e2290f3..986c121 100644
--- a/drivers/infiniband/hw/hfi1/fault.c
+++ b/drivers/infiniband/hw/hfi1/fault.c
@@ -141,18 +141,21 @@
if (!data)
return -ENOMEM;
copy = min(len, datalen - 1);
- if (copy_from_user(data, buf, copy))
- return -EFAULT;
+ if (copy_from_user(data, buf, copy)) {
+ ret = -EFAULT;
+ goto free_data;
+ }
ret = debugfs_file_get(file->f_path.dentry);
if (unlikely(ret))
- return ret;
+ goto free_data;
ptr = data;
token = ptr;
for (ptr = data; *ptr; ptr = end + 1, token = ptr) {
char *dash;
unsigned long range_start, range_end, i;
bool remove = false;
+ unsigned long bound = 1U << BITS_PER_BYTE;
end = strchr(ptr, ',');
if (end)
@@ -178,6 +181,10 @@
BITS_PER_BYTE);
break;
}
+ /* Check the inputs */
+ if (range_start >= bound || range_end >= bound)
+ break;
+
for (i = range_start; i <= range_end; i++) {
if (remove)
clear_bit(i, fault->opcodes);
@@ -190,6 +197,7 @@
ret = len;
debugfs_file_put(file->f_path.dentry);
+free_data:
kfree(data);
return ret;
}
@@ -209,7 +217,7 @@
return -ENOMEM;
ret = debugfs_file_get(file->f_path.dentry);
if (unlikely(ret))
- return ret;
+ goto free_data;
bit = find_first_bit(fault->opcodes, bitsize);
while (bit < bitsize) {
zero = find_next_zero_bit(fault->opcodes, bitsize, bit);
@@ -227,6 +235,7 @@
data[size - 1] = '\n';
data[size] = '\0';
ret = simple_read_from_buffer(buf, len, pos, data, size);
+free_data:
kfree(data);
return ret;
}
@@ -250,6 +259,7 @@
int hfi1_fault_init_debugfs(struct hfi1_ibdev *ibd)
{
struct dentry *parent = ibd->hfi1_ibdev_dbg;
+ struct dentry *fault_dir;
ibd->fault = kzalloc(sizeof(*ibd->fault), GFP_KERNEL);
if (!ibd->fault)
@@ -269,45 +279,31 @@
bitmap_zero(ibd->fault->opcodes,
sizeof(ibd->fault->opcodes) * BITS_PER_BYTE);
- ibd->fault->dir =
- fault_create_debugfs_attr("fault", parent,
- &ibd->fault->attr);
- if (IS_ERR(ibd->fault->dir)) {
+ fault_dir =
+ fault_create_debugfs_attr("fault", parent, &ibd->fault->attr);
+ if (IS_ERR(fault_dir)) {
kfree(ibd->fault);
ibd->fault = NULL;
return -ENOENT;
}
+ ibd->fault->dir = fault_dir;
- DEBUGFS_SEQ_FILE_CREATE(fault_stats, ibd->fault->dir, ibd);
- if (!debugfs_create_bool("enable", 0600, ibd->fault->dir,
- &ibd->fault->enable))
- goto fail;
- if (!debugfs_create_bool("suppress_err", 0600,
- ibd->fault->dir,
- &ibd->fault->suppress_err))
- goto fail;
- if (!debugfs_create_bool("opcode_mode", 0600, ibd->fault->dir,
- &ibd->fault->opcode))
- goto fail;
- if (!debugfs_create_file("opcodes", 0600, ibd->fault->dir,
- ibd->fault, &__fault_opcodes_fops))
- goto fail;
- if (!debugfs_create_u64("skip_pkts", 0600,
- ibd->fault->dir,
- &ibd->fault->fault_skip))
- goto fail;
- if (!debugfs_create_u64("skip_usec", 0600,
- ibd->fault->dir,
- &ibd->fault->fault_skip_usec))
- goto fail;
- if (!debugfs_create_u8("direction", 0600, ibd->fault->dir,
- &ibd->fault->direction))
- goto fail;
+ debugfs_create_file("fault_stats", 0444, fault_dir, ibd,
+ &_fault_stats_file_ops);
+ debugfs_create_bool("enable", 0600, fault_dir, &ibd->fault->enable);
+ debugfs_create_bool("suppress_err", 0600, fault_dir,
+ &ibd->fault->suppress_err);
+ debugfs_create_bool("opcode_mode", 0600, fault_dir,
+ &ibd->fault->opcode);
+ debugfs_create_file("opcodes", 0600, fault_dir, ibd->fault,
+ &__fault_opcodes_fops);
+ debugfs_create_u64("skip_pkts", 0600, fault_dir,
+ &ibd->fault->fault_skip);
+ debugfs_create_u64("skip_usec", 0600, fault_dir,
+ &ibd->fault->fault_skip_usec);
+ debugfs_create_u8("direction", 0600, fault_dir, &ibd->fault->direction);
return 0;
-fail:
- hfi1_fault_exit_debugfs(ibd);
- return -ENOMEM;
}
bool hfi1_dbg_fault_suppress_err(struct hfi1_ibdev *ibd)
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index 1fc7564..f9a7e9d 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -488,7 +488,7 @@
vmf = 1;
break;
case STATUS:
- if (flags & (unsigned long)(VM_WRITE | VM_EXEC)) {
+ if (flags & VM_WRITE) {
ret = -EPERM;
goto done;
}
@@ -681,7 +681,8 @@
HFI1_RCVCTRL_TAILUPD_DIS |
HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
- HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt);
+ HFI1_RCVCTRL_NO_EGR_DROP_DIS |
+ HFI1_RCVCTRL_URGENT_DIS, uctxt);
/* Clear the context's J_KEY */
hfi1_clear_ctxt_jkey(dd, uctxt);
/*
@@ -1096,6 +1097,7 @@
hfi1_set_ctxt_jkey(uctxt->dd, uctxt, uctxt->jkey);
rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
+ rcvctrl_ops |= HFI1_RCVCTRL_URGENT_ENB;
if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP))
rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
/*
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index cfd2523..fa45350 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -54,7 +54,6 @@
#include <linux/list.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
-#include <linux/idr.h>
#include <linux/io.h>
#include <linux/fs.h>
#include <linux/completion.h>
@@ -65,6 +64,7 @@
#include <linux/kthread.h>
#include <linux/i2c.h>
#include <linux/i2c-algo-bit.h>
+#include <linux/xarray.h>
#include <rdma/ib_hdrs.h>
#include <rdma/opa_addr.h>
#include <linux/rhashtable.h>
@@ -73,6 +73,7 @@
#include "chip_registers.h"
#include "common.h"
+#include "opfn.h"
#include "verbs.h"
#include "pio.h"
#include "chip.h"
@@ -80,6 +81,7 @@
#include "qsfp.h"
#include "platform.h"
#include "affinity.h"
+#include "msix.h"
/* bumped 1 from s/w major version of TrueScale */
#define HFI1_CHIP_VERS_MAJ 3U
@@ -97,6 +99,8 @@
#define NEIGHBOR_TYPE_HFI 0
#define NEIGHBOR_TYPE_SWITCH 1
+#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
+
extern unsigned long hfi1_cap_mask;
#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap)
#define HFI1_CAP_UGET_MASK(mask, cap) \
@@ -194,6 +198,14 @@
};
typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
+
+struct tid_queue {
+ struct list_head queue_head;
+ /* queue head for QP TID resource waiters */
+ u32 enqueue; /* count of tid enqueues */
+ u32 dequeue; /* count of tid dequeues */
+};
+
struct hfi1_ctxtdata {
/* rcvhdrq base, needs mmap before useful */
void *rcvhdrq;
@@ -287,6 +299,12 @@
/* PSM Specific fields */
/* lock protecting all Expected TID data */
struct mutex exp_mutex;
+ /* lock protecting all Expected TID data of kernel contexts */
+ spinlock_t exp_lock;
+ /* Queue for QP's waiting for HW TID flows */
+ struct tid_queue flow_queue;
+ /* Queue for QP's waiting for HW receive array entries */
+ struct tid_queue rarr_queue;
/* when waiting for rcv or pioavail */
wait_queue_head_t wait;
/* uuid from PSM */
@@ -319,6 +337,9 @@
*/
u8 subctxt_cnt;
+ /* Bit mask to track free TID RDMA HW flows */
+ unsigned long flow_mask;
+ struct tid_flow_state flows[RXE_NUM_TID_FLOWS];
};
/**
@@ -518,6 +539,37 @@
mgmt->src_qpn = cpu_to_be32(src_qp & OPA_16B_MGMT_QPN_MASK);
}
+/**
+ * hfi1_get_rc_ohdr - get extended header
+ * @opah - the opaheader
+ */
+static inline struct ib_other_headers *
+hfi1_get_rc_ohdr(struct hfi1_opa_header *opah)
+{
+ struct ib_other_headers *ohdr;
+ struct ib_header *hdr = NULL;
+ struct hfi1_16b_header *hdr_16b = NULL;
+
+ /* Find out where the BTH is */
+ if (opah->hdr_type == HFI1_PKT_TYPE_9B) {
+ hdr = &opah->ibh;
+ if (ib_get_lnh(hdr) == HFI1_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else
+ ohdr = &hdr->u.l.oth;
+ } else {
+ u8 l4;
+
+ hdr_16b = &opah->opah;
+ l4 = hfi1_16B_get_l4(hdr_16b);
+ if (l4 == OPA_16B_L4_IB_LOCAL)
+ ohdr = &hdr_16b->u.oth;
+ else
+ ohdr = &hdr_16b->u.l.oth;
+ }
+ return ohdr;
+}
+
struct rvt_sge_state;
/*
@@ -622,6 +674,8 @@
#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000
#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000
#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000
+#define HFI1_RCVCTRL_URGENT_ENB 0x40000
+#define HFI1_RCVCTRL_URGENT_DIS 0x80000
/* partition enforcement flags */
#define HFI1_PART_ENFORCE_IN 0x1
@@ -669,6 +723,14 @@
struct irq_affinity_notify notify;
};
+struct hfi1_msix_info {
+ /* lock to synchronize in_use_msix access */
+ spinlock_t msix_lock;
+ DECLARE_BITMAP(in_use_msix, CCE_NUM_MSIX_VECTORS);
+ struct hfi1_msix_entry *msix_entries;
+ u16 max_requested;
+};
+
/* per-SL CCA information */
struct cca_timer {
struct hrtimer hrtimer;
@@ -990,11 +1052,10 @@
struct hfi1_vnic_data {
struct hfi1_ctxtdata *ctxt[HFI1_NUM_VNIC_CTXT];
struct kmem_cache *txreq_cache;
+ struct xarray vesws;
u8 num_vports;
- struct idr vesw_idr;
u8 rmt_start;
u8 num_ctxt;
- u32 msix_idx;
};
struct hfi1_vnic_vport_info;
@@ -1011,7 +1072,6 @@
typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64);
struct hfi1_devdata {
struct hfi1_ibdev verbs_dev; /* must be first */
- struct list_head list;
/* pointers to related structs for this device */
/* pci access data structure */
struct pci_dev *pcidev;
@@ -1207,11 +1267,6 @@
struct diag_client *diag_client;
- /* MSI-X information */
- struct hfi1_msix_entry *msix_entries;
- u32 num_msix_entries;
- u32 first_dyn_msix_idx;
-
/* general interrupt: mask of handled interrupts */
u64 gi_mask[CCE_NUM_INT_CSRS];
@@ -1225,6 +1280,9 @@
*/
struct timer_list synth_stats_timer;
+ /* MSI-X information */
+ struct hfi1_msix_info msix_info;
+
/*
* device counters
*/
@@ -1351,6 +1409,8 @@
/* vnic data */
struct hfi1_vnic_data vnic;
+ /* Lock to protect IRQ SRC register access */
+ spinlock_t irq_src_lock;
};
static inline bool hfi1_vnic_is_rsm_full(struct hfi1_devdata *dd, int spare)
@@ -1396,8 +1456,7 @@
struct mm_struct *mm;
};
-extern struct list_head hfi1_dev_list;
-extern spinlock_t hfi1_devs_lock;
+extern struct xarray hfi1_dev_table;
struct hfi1_devdata *hfi1_lookup(int unit);
static inline unsigned long uctxt_offset(struct hfi1_ctxtdata *uctxt)
@@ -1425,7 +1484,7 @@
struct hfi1_devdata *dd, u8 hw_pidx, u8 port);
void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd);
int hfi1_rcd_put(struct hfi1_ctxtdata *rcd);
-void hfi1_rcd_get(struct hfi1_ctxtdata *rcd);
+int hfi1_rcd_get(struct hfi1_ctxtdata *rcd);
struct hfi1_ctxtdata *hfi1_rcd_get_by_index_safe(struct hfi1_devdata *dd,
u16 ctxt);
struct hfi1_ctxtdata *hfi1_rcd_get_by_index(struct hfi1_devdata *dd, u16 ctxt);
@@ -1433,9 +1492,6 @@
int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread);
int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread);
void set_all_slowpath(struct hfi1_devdata *dd);
-void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd);
-void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd);
-void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd);
extern const struct pci_device_id hfi1_pci_tbl[];
void hfi1_make_ud_req_9B(struct rvt_qp *qp,
@@ -1797,13 +1853,20 @@
return &rcd->ppd->ibport_data;
}
-void hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
- bool do_cnp);
-static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt,
- bool do_cnp)
+/**
+ * hfi1_may_ecn - Check whether FECN or BECN processing should be done
+ * @pkt: the packet to be evaluated
+ *
+ * Check whether the FECN or BECN bits in the packet's header are
+ * enabled, depending on packet type.
+ *
+ * This function only checks for FECN and BECN bits. Additional checks
+ * are done in the slowpath (hfi1_process_ecn_slowpath()) in order to
+ * ensure correct handling.
+ */
+static inline bool hfi1_may_ecn(struct hfi1_packet *pkt)
{
- bool becn;
- bool fecn;
+ bool fecn, becn;
if (pkt->etype == RHF_RCV_TYPE_BYPASS) {
fecn = hfi1_16B_get_fecn(pkt->hdr);
@@ -1812,10 +1875,18 @@
fecn = ib_bth_get_fecn(pkt->ohdr);
becn = ib_bth_get_becn(pkt->ohdr);
}
- if (unlikely(fecn || becn)) {
- hfi1_process_ecn_slowpath(qp, pkt, do_cnp);
- return fecn;
- }
+ return fecn || becn;
+}
+
+bool hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
+ bool prescan);
+static inline bool process_ecn(struct rvt_qp *qp, struct hfi1_packet *pkt)
+{
+ bool do_work;
+
+ do_work = hfi1_may_ecn(pkt);
+ if (unlikely(do_work))
+ return hfi1_process_ecn_slowpath(qp, pkt, false);
return false;
}
@@ -1889,10 +1960,8 @@
#define HFI1_CTXT_WAITING_URG 4
/* free up any allocated data at closes */
-struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
- const struct pci_device_id *ent);
+int hfi1_init_dd(struct hfi1_devdata *dd);
void hfi1_free_devdata(struct hfi1_devdata *dd);
-struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra);
/* LED beaconing functions */
void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon,
@@ -1965,6 +2034,7 @@
*/
extern const char ib_hfi1_version[];
+extern const struct attribute_group ib_hfi1_attr_group;
int hfi1_device_create(struct hfi1_devdata *dd);
void hfi1_device_remove(struct hfi1_devdata *dd);
@@ -1976,16 +2046,15 @@
/* Hook for sysfs read of QSFP */
int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len);
-int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent);
-void hfi1_clean_up_interrupts(struct hfi1_devdata *dd);
+int hfi1_pcie_init(struct hfi1_devdata *dd);
void hfi1_pcie_cleanup(struct pci_dev *pdev);
int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev);
void hfi1_pcie_ddcleanup(struct hfi1_devdata *);
int pcie_speeds(struct hfi1_devdata *dd);
-int request_msix(struct hfi1_devdata *dd, u32 msireq);
int restore_pci_variables(struct hfi1_devdata *dd);
int save_pci_variables(struct hfi1_devdata *dd);
int do_pcie_gen3_transition(struct hfi1_devdata *dd);
+void tune_pcie_caps(struct hfi1_devdata *dd);
int parse_platform_config(struct hfi1_devdata *dd);
int get_platform_config_field(struct hfi1_devdata *dd,
enum platform_config_table_type_encoding
@@ -2080,7 +2149,7 @@
SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK |
#endif
HFI1_PKT_USER_SC_INTEGRITY;
- else
+ else if (ctxt_type != SC_KERNEL)
base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY;
/* turn on send-side job key checks if !A0 */
@@ -2126,19 +2195,6 @@
return base_sdma_integrity;
}
-/*
- * hfi1_early_err is used (only!) to print early errors before devdata is
- * allocated, or when dd->pcidev may not be valid, and at the tail end of
- * cleanup when devdata may have been freed, etc. hfi1_dev_porterr is
- * the same as dd_dev_err, but is used when the message really needs
- * the IB port# to be definitive as to what's happening..
- */
-#define hfi1_early_err(dev, fmt, ...) \
- dev_err(dev, fmt, ##__VA_ARGS__)
-
-#define hfi1_early_info(dev, fmt, ...) \
- dev_info(dev, fmt, ##__VA_ARGS__)
-
#define dd_dev_emerg(dd, fmt, ...) \
dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \
rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__)
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 758d273..26b792b 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -49,11 +49,12 @@
#include <linux/netdevice.h>
#include <linux/vmalloc.h>
#include <linux/delay.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/hrtimer.h>
#include <linux/bitmap.h>
+#include <linux/numa.h>
#include <rdma/rdma_vt.h>
#include "hfi.h"
@@ -72,7 +73,6 @@
#undef pr_fmt
#define pr_fmt(fmt) DRIVER_NAME ": " fmt
-#define HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES 5
/*
* min buffers we want to have per context, after driver
*/
@@ -83,6 +83,8 @@
#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
+#define NUM_IB_PORTS 1
+
/*
* Number of user receive contexts we are configured to use (to allow for more
* pio buffers per ctxt, etc.) Zero means use one user context per CPU.
@@ -122,7 +124,7 @@
static inline u64 encode_rcv_header_entry_size(u16 size);
-static struct idr hfi1_unit_table;
+DEFINE_XARRAY_FLAGS(hfi1_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
static int hfi1_create_kctxt(struct hfi1_devdata *dd,
struct hfi1_pportdata *ppd)
@@ -213,12 +215,12 @@
struct hfi1_ctxtdata *rcd =
container_of(kref, struct hfi1_ctxtdata, kref);
- hfi1_free_ctxtdata(rcd->dd, rcd);
-
spin_lock_irqsave(&rcd->dd->uctxt_lock, flags);
rcd->dd->rcd[rcd->ctxt] = NULL;
spin_unlock_irqrestore(&rcd->dd->uctxt_lock, flags);
+ hfi1_free_ctxtdata(rcd->dd, rcd);
+
kfree(rcd);
}
@@ -241,10 +243,13 @@
* @rcd: pointer to an initialized rcd data structure
*
* Use this to get a reference after the init.
+ *
+ * Return : reflect kref_get_unless_zero(), which returns non-zero on
+ * increment, otherwise 0.
*/
-void hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
+int hfi1_rcd_get(struct hfi1_ctxtdata *rcd)
{
- kref_get(&rcd->kref);
+ return kref_get_unless_zero(&rcd->kref);
}
/**
@@ -324,7 +329,8 @@
spin_lock_irqsave(&dd->uctxt_lock, flags);
if (dd->rcd[ctxt]) {
rcd = dd->rcd[ctxt];
- hfi1_rcd_get(rcd);
+ if (!hfi1_rcd_get(rcd))
+ rcd = NULL;
}
spin_unlock_irqrestore(&dd->uctxt_lock, flags);
@@ -369,6 +375,9 @@
rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
mutex_init(&rcd->exp_mutex);
+ spin_lock_init(&rcd->exp_lock);
+ INIT_LIST_HEAD(&rcd->flow_queue.queue_head);
+ INIT_LIST_HEAD(&rcd->rarr_queue.queue_head);
hfi1_cdbg(PROC, "setting up context %u\n", rcd->ctxt);
@@ -460,7 +469,7 @@
if (rcd->egrbufs.size < hfi1_max_mtu) {
rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
hfi1_cdbg(PROC,
- "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
+ "ctxt%u: eager bufs size too small. Adjusting to %u\n",
rcd->ctxt, rcd->egrbufs.size);
}
rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
@@ -471,6 +480,9 @@
GFP_KERNEL, numa);
if (!rcd->opstats)
goto bail;
+
+ /* Initialize TID flow generations for the context */
+ hfi1_kern_init_ctxt_generations(rcd);
}
*context = rcd;
@@ -654,9 +666,8 @@
ppd->part_enforce |= HFI1_PART_ENFORCE_IN;
if (loopback) {
- hfi1_early_err(&pdev->dev,
- "Faking data partition 0x8001 in idx %u\n",
- !default_pkey_idx);
+ dd_dev_err(dd, "Faking data partition 0x8001 in idx %u\n",
+ !default_pkey_idx);
ppd->pkeys[!default_pkey_idx] = 0x8001;
}
@@ -702,9 +713,7 @@
return;
bail:
-
- hfi1_early_err(&pdev->dev,
- "Congestion Control Agent disabled for port %d\n", port);
+ dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port);
}
/*
@@ -773,6 +782,8 @@
rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
if (HFI1_CAP_KGET_MASK(rcd->flags, NODROP_EGR_FULL))
rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+ if (HFI1_CAP_IS_KSET(TID_RDMA))
+ rcvmask |= HFI1_RCVCTRL_TIDFLOW_ENB;
hfi1_rcvctrl(dd, rcvmask, rcd);
sc_enable(rcd->sc);
hfi1_rcd_put(rcd);
@@ -794,7 +805,8 @@
ppd->hfi1_wq =
alloc_workqueue(
"hfi%d_%d",
- WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+ WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
+ WQ_MEM_RECLAIM,
HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
dd->unit, pidx);
if (!ppd->hfi1_wq)
@@ -833,6 +845,23 @@
}
/**
+ * enable_general_intr() - Enable the IRQs that will be handled by the
+ * general interrupt handler.
+ * @dd: valid devdata
+ *
+ */
+static void enable_general_intr(struct hfi1_devdata *dd)
+{
+ set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true);
+ set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true);
+ set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true);
+ set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true);
+ set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true);
+ set_intr_bits(dd, IS_DC_START, IS_DC_END, true);
+ set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true);
+}
+
+/**
* hfi1_init - do the actual initialization sequence on the chip
* @dd: the hfi1_ib device
* @reinit: re-initializing, so don't allocate new memory
@@ -883,10 +912,10 @@
goto done;
/* allocate dummy tail memory for all receive contexts */
- dd->rcvhdrtail_dummy_kvaddr = dma_zalloc_coherent(
- &dd->pcidev->dev, sizeof(u64),
- &dd->rcvhdrtail_dummy_dma,
- GFP_KERNEL);
+ dd->rcvhdrtail_dummy_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
+ sizeof(u64),
+ &dd->rcvhdrtail_dummy_dma,
+ GFP_KERNEL);
if (!dd->rcvhdrtail_dummy_kvaddr) {
dd_dev_err(dd, "cannot allocate dummy tail memory\n");
@@ -911,11 +940,14 @@
lastfail = hfi1_create_rcvhdrq(dd, rcd);
if (!lastfail)
lastfail = hfi1_setup_eagerbufs(rcd);
+ if (!lastfail)
+ lastfail = hfi1_kern_exp_rcv_init(rcd, reinit);
if (lastfail) {
dd_dev_err(dd,
"failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
ret = lastfail;
}
+ /* enable IRQ */
hfi1_rcd_put(rcd);
}
@@ -954,7 +986,8 @@
HFI1_STATUS_INITTED;
if (!ret) {
/* enable all interrupts from the chip */
- set_intr_state(dd, 1);
+ enable_general_intr(dd);
+ init_qsfp_int(dd);
/* chip is OK for user apps; mark it as initialized */
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
@@ -986,21 +1019,9 @@
return ret;
}
-static inline struct hfi1_devdata *__hfi1_lookup(int unit)
-{
- return idr_find(&hfi1_unit_table, unit);
-}
-
struct hfi1_devdata *hfi1_lookup(int unit)
{
- struct hfi1_devdata *dd;
- unsigned long flags;
-
- spin_lock_irqsave(&hfi1_devs_lock, flags);
- dd = __hfi1_lookup(unit);
- spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-
- return dd;
+ return xa_load(&hfi1_dev_table, unit);
}
/*
@@ -1051,9 +1072,9 @@
}
dd->flags &= ~HFI1_INITTED;
- /* mask and clean up interrupts, but not errors */
- set_intr_state(dd, 0);
- hfi1_clean_up_interrupts(dd);
+ /* mask and clean up interrupts */
+ set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false);
+ msix_clean_up_interrupts(dd);
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
ppd = dd->pport + pidx;
@@ -1168,7 +1189,7 @@
/*
* Release our hold on the shared asic data. If we are the last one,
* return the structure to be finalized outside the lock. Must be
- * holding hfi1_devs_lock.
+ * holding hfi1_dev_table lock.
*/
static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd)
{
@@ -1204,13 +1225,10 @@
struct hfi1_asic_data *ad;
unsigned long flags;
- spin_lock_irqsave(&hfi1_devs_lock, flags);
- if (!list_empty(&dd->list)) {
- idr_remove(&hfi1_unit_table, dd->unit);
- list_del_init(&dd->list);
- }
+ xa_lock_irqsave(&hfi1_dev_table, flags);
+ __xa_erase(&hfi1_dev_table, dd->unit);
ad = release_asic_data(dd);
- spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+ xa_unlock_irqrestore(&hfi1_dev_table, flags);
finalize_asic_data(dd, ad);
free_platform_config(dd);
@@ -1246,17 +1264,18 @@
kobject_put(&dd->kobj);
}
-/*
- * Allocate our primary per-unit data structure. Must be done via verbs
- * allocator, because the verbs cleanup process both does cleanup and
- * free of the data structure.
- * "extra" is for chip-specific data.
+/**
+ * hfi1_alloc_devdata - Allocate our primary per-unit data structure.
+ * @pdev: Valid PCI device
+ * @extra: How many bytes to alloc past the default
*
- * Use the idr mechanism to get a unit number for this unit.
+ * Must be done via verbs allocator, because the verbs cleanup process
+ * both does cleanup and free of the data structure.
+ * "extra" is for chip-specific data.
*/
-struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra)
+static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
+ size_t extra)
{
- unsigned long flags;
struct hfi1_devdata *dd;
int ret, nports;
@@ -1271,24 +1290,13 @@
dd->pport = (struct hfi1_pportdata *)(dd + 1);
dd->pcidev = pdev;
pci_set_drvdata(pdev, dd);
+ dd->node = NUMA_NO_NODE;
- INIT_LIST_HEAD(&dd->list);
- idr_preload(GFP_KERNEL);
- spin_lock_irqsave(&hfi1_devs_lock, flags);
-
- ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
- if (ret >= 0) {
- dd->unit = ret;
- list_add(&dd->list, &hfi1_dev_list);
- }
- dd->node = -1;
-
- spin_unlock_irqrestore(&hfi1_devs_lock, flags);
- idr_preload_end();
-
+ ret = xa_alloc_irq(&hfi1_dev_table, &dd->unit, dd, xa_limit_32b,
+ GFP_KERNEL);
if (ret < 0) {
- hfi1_early_err(&pdev->dev,
- "Could not allocate unit ID: error %d\n", -ret);
+ dev_err(&pdev->dev,
+ "Could not allocate unit ID: error %d\n", -ret);
goto bail;
}
rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit);
@@ -1309,6 +1317,7 @@
spin_lock_init(&dd->pio_map_lock);
mutex_init(&dd->dc8051_lock);
init_waitqueue_head(&dd->event_queue);
+ spin_lock_init(&dd->irq_src_lock);
dd->int_counter = alloc_percpu(u64);
if (!dd->int_counter) {
@@ -1474,16 +1483,17 @@
/* sanitize link CRC options */
link_crc_mask &= SUPPORTED_CRCS;
+ ret = opfn_init();
+ if (ret < 0) {
+ pr_err("Failed to allocate opfn_wq");
+ goto bail_dev;
+ }
+
/*
* These must be called before the driver is registered with
* the PCI subsystem.
*/
- idr_init(&hfi1_unit_table);
-
hfi1_dbg_init();
- ret = hfi1_wss_init();
- if (ret < 0)
- goto bail_wss;
ret = pci_register_driver(&hfi1_pci_driver);
if (ret < 0) {
pr_err("Unable to register driver: error %d\n", -ret);
@@ -1492,10 +1502,7 @@
goto bail; /* all OK */
bail_dev:
- hfi1_wss_exit();
-bail_wss:
hfi1_dbg_exit();
- idr_destroy(&hfi1_unit_table);
dev_cleanup();
bail:
return ret;
@@ -1509,11 +1516,11 @@
static void __exit hfi1_mod_cleanup(void)
{
pci_unregister_driver(&hfi1_pci_driver);
+ opfn_exit();
node_affinity_destroy_all();
- hfi1_wss_exit();
hfi1_dbg_exit();
- idr_destroy(&hfi1_unit_table);
+ WARN_ON(!xa_empty(&hfi1_dev_table));
dispose_firmware(); /* asymmetric with obtain_firmware() */
dev_cleanup();
}
@@ -1564,7 +1571,7 @@
struct hfi1_ctxtdata *rcd = dd->rcd[ctxt];
if (rcd) {
- hfi1_clear_tids(rcd);
+ hfi1_free_ctxt_rcv_groups(rcd);
hfi1_free_ctxt(rcd);
}
}
@@ -1604,23 +1611,23 @@
hfi1_free_devdata(dd);
}
-static int init_validate_rcvhdrcnt(struct device *dev, uint thecnt)
+static int init_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt)
{
if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
- hfi1_early_err(dev, "Receive header queue count too small\n");
+ dd_dev_err(dd, "Receive header queue count too small\n");
return -EINVAL;
}
if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
- hfi1_early_err(dev,
- "Receive header queue count cannot be greater than %u\n",
- HFI1_MAX_HDRQ_EGRBUF_CNT);
+ dd_dev_err(dd,
+ "Receive header queue count cannot be greater than %u\n",
+ HFI1_MAX_HDRQ_EGRBUF_CNT);
return -EINVAL;
}
if (thecnt % HDRQ_INCREMENT) {
- hfi1_early_err(dev, "Receive header queue count %d must be divisible by %lu\n",
- thecnt, HDRQ_INCREMENT);
+ dd_dev_err(dd, "Receive header queue count %d must be divisible by %lu\n",
+ thecnt, HDRQ_INCREMENT);
return -EINVAL;
}
@@ -1639,22 +1646,29 @@
/* Validate dev ids */
if (!(ent->device == PCI_DEVICE_ID_INTEL0 ||
ent->device == PCI_DEVICE_ID_INTEL1)) {
- hfi1_early_err(&pdev->dev,
- "Failing on unknown Intel deviceid 0x%x\n",
- ent->device);
+ dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n",
+ ent->device);
ret = -ENODEV;
goto bail;
}
+ /* Allocate the dd so we can get to work */
+ dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
+ sizeof(struct hfi1_pportdata));
+ if (IS_ERR(dd)) {
+ ret = PTR_ERR(dd);
+ goto bail;
+ }
+
/* Validate some global module parameters */
- ret = init_validate_rcvhdrcnt(&pdev->dev, rcvhdrcnt);
+ ret = init_validate_rcvhdrcnt(dd, rcvhdrcnt);
if (ret)
goto bail;
/* use the encoding function as a sanitization check */
if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) {
- hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n",
- hfi1_hdrq_entsize);
+ dd_dev_err(dd, "Invalid HdrQ Entry size %u\n",
+ hfi1_hdrq_entsize);
ret = -EINVAL;
goto bail;
}
@@ -1676,10 +1690,10 @@
clamp_val(eager_buffer_size,
MIN_EAGER_BUFFER * 8,
MAX_EAGER_BUFFER_TOTAL);
- hfi1_early_info(&pdev->dev, "Eager buffer size %u\n",
- eager_buffer_size);
+ dd_dev_info(dd, "Eager buffer size %u\n",
+ eager_buffer_size);
} else {
- hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n");
+ dd_dev_err(dd, "Invalid Eager buffer size of 0\n");
ret = -EINVAL;
goto bail;
}
@@ -1687,7 +1701,7 @@
/* restrict value of hfi1_rcvarr_split */
hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100);
- ret = hfi1_pcie_init(pdev, ent);
+ ret = hfi1_pcie_init(dd);
if (ret)
goto bail;
@@ -1695,12 +1709,9 @@
* Do device-specific initialization, function table setup, dd
* allocation, etc.
*/
- dd = hfi1_init_dd(pdev, ent);
-
- if (IS_ERR(dd)) {
- ret = PTR_ERR(dd);
+ ret = hfi1_init_dd(dd);
+ if (ret)
goto clean_bail; /* error already printed */
- }
ret = create_workqueues(dd);
if (ret)
@@ -1731,7 +1742,7 @@
dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
if (initfail || ret) {
- hfi1_clean_up_interrupts(dd);
+ msix_clean_up_interrupts(dd);
stop_timers(dd);
flush_workqueue(ib_wq);
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
@@ -1842,9 +1853,9 @@
gfp_flags = GFP_KERNEL;
else
gfp_flags = GFP_USER;
- rcd->rcvhdrq = dma_zalloc_coherent(
- &dd->pcidev->dev, amt, &rcd->rcvhdrq_dma,
- gfp_flags | __GFP_COMP);
+ rcd->rcvhdrq = dma_alloc_coherent(&dd->pcidev->dev, amt,
+ &rcd->rcvhdrq_dma,
+ gfp_flags | __GFP_COMP);
if (!rcd->rcvhdrq) {
dd_dev_err(dd,
@@ -1855,9 +1866,10 @@
if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
HFI1_CAP_UGET_MASK(rcd->flags, DMA_RTAIL)) {
- rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent(
- &dd->pcidev->dev, PAGE_SIZE,
- &rcd->rcvhdrqtailaddr_dma, gfp_flags);
+ rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
+ PAGE_SIZE,
+ &rcd->rcvhdrqtailaddr_dma,
+ gfp_flags);
if (!rcd->rcvhdrtail_kvaddr)
goto bail_free;
}
@@ -1953,10 +1965,10 @@
while (alloced_bytes < rcd->egrbufs.size &&
rcd->egrbufs.alloced < rcd->egrbufs.count) {
rcd->egrbufs.buffers[idx].addr =
- dma_zalloc_coherent(&dd->pcidev->dev,
- rcd->egrbufs.rcvtid_size,
- &rcd->egrbufs.buffers[idx].dma,
- gfp_flags);
+ dma_alloc_coherent(&dd->pcidev->dev,
+ rcd->egrbufs.rcvtid_size,
+ &rcd->egrbufs.buffers[idx].dma,
+ gfp_flags);
if (rcd->egrbufs.buffers[idx].addr) {
rcd->egrbufs.buffers[idx].len =
rcd->egrbufs.rcvtid_size;
@@ -2027,7 +2039,7 @@
rcd->egrbufs.size = alloced_bytes;
hfi1_cdbg(PROC,
- "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
+ "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %uKB\n",
rcd->ctxt, rcd->egrbufs.alloced,
rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);
diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c
new file mode 100644
index 0000000..adb4a1b
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/iowait.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#include "iowait.h"
+#include "trace_iowait.h"
+
+/* 1 priority == 16 starve_cnt */
+#define IOWAIT_PRIORITY_STARVE_SHIFT 4
+
+void iowait_set_flag(struct iowait *wait, u32 flag)
+{
+ trace_hfi1_iowait_set(wait, flag);
+ set_bit(flag, &wait->flags);
+}
+
+bool iowait_flag_set(struct iowait *wait, u32 flag)
+{
+ return test_bit(flag, &wait->flags);
+}
+
+inline void iowait_clear_flag(struct iowait *wait, u32 flag)
+{
+ trace_hfi1_iowait_clear(wait, flag);
+ clear_bit(flag, &wait->flags);
+}
+
+/**
+ * iowait_init() - initialize wait structure
+ * @wait: wait struct to initialize
+ * @tx_limit: limit for overflow queuing
+ * @func: restart function for workqueue
+ * @sleep: sleep function for no space
+ * @resume: wakeup function for no space
+ *
+ * This function initializes the iowait
+ * structure embedded in the QP or PQ.
+ *
+ */
+void iowait_init(struct iowait *wait, u32 tx_limit,
+ void (*func)(struct work_struct *work),
+ void (*tidfunc)(struct work_struct *work),
+ int (*sleep)(struct sdma_engine *sde,
+ struct iowait_work *wait,
+ struct sdma_txreq *tx,
+ uint seq,
+ bool pkts_sent),
+ void (*wakeup)(struct iowait *wait, int reason),
+ void (*sdma_drained)(struct iowait *wait),
+ void (*init_priority)(struct iowait *wait))
+{
+ int i;
+
+ wait->count = 0;
+ INIT_LIST_HEAD(&wait->list);
+ init_waitqueue_head(&wait->wait_dma);
+ init_waitqueue_head(&wait->wait_pio);
+ atomic_set(&wait->sdma_busy, 0);
+ atomic_set(&wait->pio_busy, 0);
+ wait->tx_limit = tx_limit;
+ wait->sleep = sleep;
+ wait->wakeup = wakeup;
+ wait->sdma_drained = sdma_drained;
+ wait->init_priority = init_priority;
+ wait->flags = 0;
+ for (i = 0; i < IOWAIT_SES; i++) {
+ wait->wait[i].iow = wait;
+ INIT_LIST_HEAD(&wait->wait[i].tx_head);
+ if (i == IOWAIT_IB_SE)
+ INIT_WORK(&wait->wait[i].iowork, func);
+ else
+ INIT_WORK(&wait->wait[i].iowork, tidfunc);
+ }
+}
+
+/**
+ * iowait_cancel_work - cancel all work in iowait
+ * @w: the iowait struct
+ */
+void iowait_cancel_work(struct iowait *w)
+{
+ cancel_work_sync(&iowait_get_ib_work(w)->iowork);
+ cancel_work_sync(&iowait_get_tid_work(w)->iowork);
+}
+
+/**
+ * iowait_set_work_flag - set work flag based on leg
+ * @w - the iowait work struct
+ */
+int iowait_set_work_flag(struct iowait_work *w)
+{
+ if (w == &w->iow->wait[IOWAIT_IB_SE]) {
+ iowait_set_flag(w->iow, IOWAIT_PENDING_IB);
+ return IOWAIT_IB_SE;
+ }
+ iowait_set_flag(w->iow, IOWAIT_PENDING_TID);
+ return IOWAIT_TID_SE;
+}
+
+/**
+ * iowait_priority_update_top - update the top priority entry
+ * @w: the iowait struct
+ * @top: a pointer to the top priority entry
+ * @idx: the index of the current iowait in an array
+ * @top_idx: the array index for the iowait entry that has the top priority
+ *
+ * This function is called to compare the priority of a given
+ * iowait with the given top priority entry. The top index will
+ * be returned.
+ */
+uint iowait_priority_update_top(struct iowait *w,
+ struct iowait *top,
+ uint idx, uint top_idx)
+{
+ u8 cnt, tcnt;
+
+ /* Convert priority into starve_cnt and compare the total.*/
+ cnt = (w->priority << IOWAIT_PRIORITY_STARVE_SHIFT) + w->starved_cnt;
+ tcnt = (top->priority << IOWAIT_PRIORITY_STARVE_SHIFT) +
+ top->starved_cnt;
+ if (cnt > tcnt)
+ return idx;
+ else
+ return top_idx;
+}
diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h
index 3d9c32c..07847cb 100644
--- a/drivers/infiniband/hw/hfi1/iowait.h
+++ b/drivers/infiniband/hw/hfi1/iowait.h
@@ -1,7 +1,7 @@
#ifndef _HFI1_IOWAIT_H
#define _HFI1_IOWAIT_H
/*
- * Copyright(c) 2015, 2016 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -49,6 +49,7 @@
#include <linux/list.h>
#include <linux/workqueue.h>
+#include <linux/wait.h>
#include <linux/sched.h>
#include "sdma_txreq.h"
@@ -59,16 +60,48 @@
*/
typedef void (*restart_t)(struct work_struct *work);
+#define IOWAIT_PENDING_IB 0x0
+#define IOWAIT_PENDING_TID 0x1
+
+/*
+ * A QP can have multiple Send Engines (SEs).
+ *
+ * The current use case is for supporting a TID RDMA
+ * packet build/xmit mechanism independent from verbs.
+ */
+#define IOWAIT_SES 2
+#define IOWAIT_IB_SE 0
+#define IOWAIT_TID_SE 1
+
struct sdma_txreq;
struct sdma_engine;
/**
- * struct iowait - linkage for delayed progress/waiting
+ * @iowork: the work struct
+ * @tx_head: list of prebuilt packets
+ * @iow: the parent iowait structure
+ *
+ * This structure is the work item (process) specific
+ * details associated with the each of the two SEs of the
+ * QP.
+ *
+ * The workstruct and the queued TXs are unique to each
+ * SE.
+ */
+struct iowait;
+struct iowait_work {
+ struct work_struct iowork;
+ struct list_head tx_head;
+ struct iowait *iow;
+};
+
+/**
* @list: used to add/insert into QP/PQ wait lists
- * @lock: uses to record the list head lock
* @tx_head: overflow list of sdma_txreq's
* @sleep: no space callback
* @wakeup: space callback wakeup
* @sdma_drained: sdma count drained
+ * @init_priority: callback to manipulate priority
+ * @lock: lock protected head of wait queue
* @iowork: workqueue overhead
* @wait_dma: wait for sdma_busy == 0
* @wait_pio: wait for pio_busy == 0
@@ -76,6 +109,8 @@
* @count: total number of descriptors in tx_head'ed list
* @tx_limit: limit for overflow queuing
* @tx_count: number of tx entry's in tx_head'ed list
+ * @flags: wait flags (one per QP)
+ * @wait: SE array for multiple legs
*
* This is to be embedded in user's state structure
* (QP or PQ).
@@ -86,10 +121,13 @@
* are callbacks for the ULP to implement
* what ever queuing/dequeuing of
* the embedded iowait and its containing struct
- * when a resource shortage like SDMA ring space is seen.
+ * when a resource shortage like SDMA ring space
+ * or PIO credit space is seen.
*
* Both potentially have locks help
- * so sleeping is not allowed.
+ * so sleeping is not allowed and it is not
+ * supported to submit txreqs from the wakeup
+ * call directly because of lock conflicts.
*
* The wait_dma member along with the iow
*
@@ -98,21 +136,19 @@
* Waiters explicity know that, but the destroy
* code that unwaits QPs does not.
*/
-
struct iowait {
struct list_head list;
- struct list_head tx_head;
int (*sleep)(
struct sdma_engine *sde,
- struct iowait *wait,
+ struct iowait_work *wait,
struct sdma_txreq *tx,
uint seq,
bool pkts_sent
);
void (*wakeup)(struct iowait *wait, int reason);
void (*sdma_drained)(struct iowait *wait);
+ void (*init_priority)(struct iowait *wait);
seqlock_t *lock;
- struct work_struct iowork;
wait_queue_head_t wait_dma;
wait_queue_head_t wait_pio;
atomic_t sdma_busy;
@@ -121,63 +157,51 @@
u32 tx_limit;
u32 tx_count;
u8 starved_cnt;
+ u8 priority;
+ unsigned long flags;
+ struct iowait_work wait[IOWAIT_SES];
};
#define SDMA_AVAIL_REASON 0
-/**
- * iowait_init() - initialize wait structure
- * @wait: wait struct to initialize
- * @tx_limit: limit for overflow queuing
- * @func: restart function for workqueue
- * @sleep: sleep function for no space
- * @resume: wakeup function for no space
- *
- * This function initializes the iowait
- * structure embedded in the QP or PQ.
- *
- */
+void iowait_set_flag(struct iowait *wait, u32 flag);
+bool iowait_flag_set(struct iowait *wait, u32 flag);
+void iowait_clear_flag(struct iowait *wait, u32 flag);
-static inline void iowait_init(
- struct iowait *wait,
- u32 tx_limit,
- void (*func)(struct work_struct *work),
- int (*sleep)(
- struct sdma_engine *sde,
- struct iowait *wait,
- struct sdma_txreq *tx,
- uint seq,
- bool pkts_sent),
- void (*wakeup)(struct iowait *wait, int reason),
- void (*sdma_drained)(struct iowait *wait))
-{
- wait->count = 0;
- wait->lock = NULL;
- INIT_LIST_HEAD(&wait->list);
- INIT_LIST_HEAD(&wait->tx_head);
- INIT_WORK(&wait->iowork, func);
- init_waitqueue_head(&wait->wait_dma);
- init_waitqueue_head(&wait->wait_pio);
- atomic_set(&wait->sdma_busy, 0);
- atomic_set(&wait->pio_busy, 0);
- wait->tx_limit = tx_limit;
- wait->sleep = sleep;
- wait->wakeup = wakeup;
- wait->sdma_drained = sdma_drained;
-}
+void iowait_init(struct iowait *wait, u32 tx_limit,
+ void (*func)(struct work_struct *work),
+ void (*tidfunc)(struct work_struct *work),
+ int (*sleep)(struct sdma_engine *sde,
+ struct iowait_work *wait,
+ struct sdma_txreq *tx,
+ uint seq,
+ bool pkts_sent),
+ void (*wakeup)(struct iowait *wait, int reason),
+ void (*sdma_drained)(struct iowait *wait),
+ void (*init_priority)(struct iowait *wait));
/**
- * iowait_schedule() - initialize wait structure
+ * iowait_schedule() - schedule the default send engine work
* @wait: wait struct to schedule
* @wq: workqueue for schedule
* @cpu: cpu
*/
-static inline void iowait_schedule(
- struct iowait *wait,
- struct workqueue_struct *wq,
- int cpu)
+static inline bool iowait_schedule(struct iowait *wait,
+ struct workqueue_struct *wq, int cpu)
{
- queue_work_on(cpu, wq, &wait->iowork);
+ return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_IB_SE].iowork);
+}
+
+/**
+ * iowait_tid_schedule - schedule the tid SE
+ * @wait: the iowait structure
+ * @wq: the work queue
+ * @cpu: the cpu
+ */
+static inline bool iowait_tid_schedule(struct iowait *wait,
+ struct workqueue_struct *wq, int cpu)
+{
+ return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_TID_SE].iowork);
}
/**
@@ -228,6 +252,8 @@
*/
static inline int iowait_sdma_dec(struct iowait *wait)
{
+ if (!wait)
+ return 0;
return atomic_dec_and_test(&wait->sdma_busy);
}
@@ -267,11 +293,13 @@
}
/**
- * iowait_sdma_dec - note pio complete
+ * iowait_pio_dec - note pio complete
* @wait: iowait structure
*/
static inline int iowait_pio_dec(struct iowait *wait)
{
+ if (!wait)
+ return 0;
return atomic_dec_and_test(&wait->pio_busy);
}
@@ -293,9 +321,9 @@
/**
* iowait_get_txhead() - get packet off of iowait list
*
- * @wait wait struture
+ * @wait iowait_work struture
*/
-static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait)
+static inline struct sdma_txreq *iowait_get_txhead(struct iowait_work *wait)
{
struct sdma_txreq *tx = NULL;
@@ -309,6 +337,61 @@
return tx;
}
+static inline u16 iowait_get_desc(struct iowait_work *w)
+{
+ u16 num_desc = 0;
+ struct sdma_txreq *tx = NULL;
+
+ if (!list_empty(&w->tx_head)) {
+ tx = list_first_entry(&w->tx_head, struct sdma_txreq,
+ list);
+ num_desc = tx->num_desc;
+ if (tx->flags & SDMA_TXREQ_F_VIP)
+ w->iow->priority++;
+ }
+ return num_desc;
+}
+
+static inline u32 iowait_get_all_desc(struct iowait *w)
+{
+ u32 num_desc = 0;
+
+ num_desc = iowait_get_desc(&w->wait[IOWAIT_IB_SE]);
+ num_desc += iowait_get_desc(&w->wait[IOWAIT_TID_SE]);
+ return num_desc;
+}
+
+static inline void iowait_update_priority(struct iowait_work *w)
+{
+ struct sdma_txreq *tx = NULL;
+
+ if (!list_empty(&w->tx_head)) {
+ tx = list_first_entry(&w->tx_head, struct sdma_txreq,
+ list);
+ if (tx->flags & SDMA_TXREQ_F_VIP)
+ w->iow->priority++;
+ }
+}
+
+static inline void iowait_update_all_priority(struct iowait *w)
+{
+ iowait_update_priority(&w->wait[IOWAIT_IB_SE]);
+ iowait_update_priority(&w->wait[IOWAIT_TID_SE]);
+}
+
+static inline void iowait_init_priority(struct iowait *w)
+{
+ w->priority = 0;
+ if (w->init_priority)
+ w->init_priority(w);
+}
+
+static inline void iowait_get_priority(struct iowait *w)
+{
+ iowait_init_priority(w);
+ iowait_update_all_priority(w);
+}
+
/**
* iowait_queue - Put the iowait on a wait queue
* @pkts_sent: have some packets been sent before queuing?
@@ -325,14 +408,18 @@
/*
* To play fair, insert the iowait at the tail of the wait queue if it
* has already sent some packets; Otherwise, put it at the head.
+ * However, if it has priority packets to send, also put it at the
+ * head.
*/
- if (pkts_sent) {
- list_add_tail(&w->list, wait_head);
+ if (pkts_sent)
w->starved_cnt = 0;
- } else {
- list_add(&w->list, wait_head);
+ else
w->starved_cnt++;
- }
+
+ if (w->priority > 0 || !pkts_sent)
+ list_add(&w->list, wait_head);
+ else
+ list_add_tail(&w->list, wait_head);
}
/**
@@ -349,35 +436,63 @@
w->starved_cnt = 0;
}
-/**
- * iowait_starve_find_max - Find the maximum of the starve count
- * @w: the iowait struct
- * @max: a variable containing the max starve count
- * @idx: the index of the current iowait in an array
- * @max_idx: a variable containing the array index for the
- * iowait entry that has the max starve count
- *
- * This function is called to compare the starve count of a
- * given iowait with the given max starve count. The max starve
- * count and the index will be updated if the iowait's start
- * count is larger.
- */
-static inline void iowait_starve_find_max(struct iowait *w, u8 *max,
- uint idx, uint *max_idx)
-{
- if (w->starved_cnt > *max) {
- *max = w->starved_cnt;
- *max_idx = idx;
- }
-}
+/* Update the top priority index */
+uint iowait_priority_update_top(struct iowait *w,
+ struct iowait *top,
+ uint idx, uint top_idx);
/**
- * iowait_packet_queued() - determine if a packet is already built
- * @wait: the wait structure
+ * iowait_packet_queued() - determine if a packet is queued
+ * @wait: the iowait_work structure
*/
-static inline bool iowait_packet_queued(struct iowait *wait)
+static inline bool iowait_packet_queued(struct iowait_work *wait)
{
return !list_empty(&wait->tx_head);
}
+/**
+ * inc_wait_count - increment wait counts
+ * @w: the log work struct
+ * @n: the count
+ */
+static inline void iowait_inc_wait_count(struct iowait_work *w, u16 n)
+{
+ if (!w)
+ return;
+ w->iow->tx_count++;
+ w->iow->count += n;
+}
+
+/**
+ * iowait_get_tid_work - return iowait_work for tid SE
+ * @w: the iowait struct
+ */
+static inline struct iowait_work *iowait_get_tid_work(struct iowait *w)
+{
+ return &w->wait[IOWAIT_TID_SE];
+}
+
+/**
+ * iowait_get_ib_work - return iowait_work for ib SE
+ * @w: the iowait struct
+ */
+static inline struct iowait_work *iowait_get_ib_work(struct iowait *w)
+{
+ return &w->wait[IOWAIT_IB_SE];
+}
+
+/**
+ * iowait_ioww_to_iow - return iowait given iowait_work
+ * @w: the iowait_work struct
+ */
+static inline struct iowait *iowait_ioww_to_iow(struct iowait_work *w)
+{
+ if (likely(w))
+ return w->iow;
+ return NULL;
+}
+
+void iowait_cancel_work(struct iowait *w);
+int iowait_set_work_flag(struct iowait_work *w);
+
#endif
diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c
index 0307405..d8ff063 100644
--- a/drivers/infiniband/hw/hfi1/mad.c
+++ b/drivers/infiniband/hw/hfi1/mad.c
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2015-2017 Intel Corporation.
+ * Copyright(c) 2015-2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -305,7 +305,7 @@
rcu_read_lock();
qp0 = rcu_dereference(ibp->rvp.qp[0]);
if (qp0)
- ah = rdma_create_ah(qp0->ibqp.pd, &attr);
+ ah = rdma_create_ah(qp0->ibqp.pd, &attr, 0);
rcu_read_unlock();
return ah;
}
@@ -2326,7 +2326,7 @@
__be32 vl_select_mask;
};
-#define VL_MASK_ALL 0x000080ff
+#define VL_MASK_ALL 0x00000000000080ffUL
struct opa_port_status_rsp {
__u8 port_num;
@@ -2625,15 +2625,14 @@
}
static void a0_portstatus(struct hfi1_pportdata *ppd,
- struct opa_port_status_rsp *rsp, u32 vl_select_mask)
+ struct opa_port_status_rsp *rsp)
{
if (!is_bx(ppd->dd)) {
unsigned long vl;
u64 sum_vl_xmit_wait = 0;
- u32 vl_all_mask = VL_MASK_ALL;
+ unsigned long vl_all_mask = VL_MASK_ALL;
- for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
- 8 * sizeof(vl_all_mask)) {
+ for_each_set_bit(vl, &vl_all_mask, BITS_PER_LONG) {
u64 tmp = sum_vl_xmit_wait +
read_port_cntr(ppd, C_TX_WAIT_VL,
idx_from_vl(vl));
@@ -2730,12 +2729,12 @@
(struct opa_port_status_req *)pmp->data;
struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
struct opa_port_status_rsp *rsp;
- u32 vl_select_mask = be32_to_cpu(req->vl_select_mask);
+ unsigned long vl_select_mask = be32_to_cpu(req->vl_select_mask);
unsigned long vl;
size_t response_data_size;
u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
u8 port_num = req->port_num;
- u8 num_vls = hweight32(vl_select_mask);
+ u8 num_vls = hweight64(vl_select_mask);
struct _vls_pctrs *vlinfo;
struct hfi1_ibport *ibp = to_iport(ibdev, port);
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
@@ -2744,8 +2743,7 @@
u16 link_width;
u16 link_speed;
- response_data_size = sizeof(struct opa_port_status_rsp) +
- num_vls * sizeof(struct _vls_pctrs);
+ response_data_size = struct_size(rsp, vls, num_vls);
if (response_data_size > sizeof(pmp->data)) {
pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE;
return reply((struct ib_mad_hdr *)pmp);
@@ -2771,7 +2769,7 @@
hfi1_read_link_quality(dd, &rsp->link_quality_indicator);
- rsp->vl_select_mask = cpu_to_be32(vl_select_mask);
+ rsp->vl_select_mask = cpu_to_be32((u32)vl_select_mask);
rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS,
CNTR_INVALID_VL));
rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS,
@@ -2842,8 +2840,7 @@
* So in the for_each_set_bit() loop below, we don't need
* any additional checks for vl.
*/
- for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
- 8 * sizeof(vl_select_mask)) {
+ for_each_set_bit(vl, &vl_select_mask, BITS_PER_LONG) {
memset(vlinfo, 0, sizeof(*vlinfo));
tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl));
@@ -2884,7 +2881,7 @@
vfi++;
}
- a0_portstatus(ppd, rsp, vl_select_mask);
+ a0_portstatus(ppd, rsp);
if (resp_len)
*resp_len += response_data_size;
@@ -2931,16 +2928,14 @@
return error_counter_summary;
}
-static void a0_datacounters(struct hfi1_pportdata *ppd, struct _port_dctrs *rsp,
- u32 vl_select_mask)
+static void a0_datacounters(struct hfi1_pportdata *ppd, struct _port_dctrs *rsp)
{
if (!is_bx(ppd->dd)) {
unsigned long vl;
u64 sum_vl_xmit_wait = 0;
- u32 vl_all_mask = VL_MASK_ALL;
+ unsigned long vl_all_mask = VL_MASK_ALL;
- for_each_set_bit(vl, (unsigned long *)&(vl_all_mask),
- 8 * sizeof(vl_all_mask)) {
+ for_each_set_bit(vl, &vl_all_mask, BITS_PER_LONG) {
u64 tmp = sum_vl_xmit_wait +
read_port_cntr(ppd, C_TX_WAIT_VL,
idx_from_vl(vl));
@@ -2995,7 +2990,7 @@
u64 port_mask;
u8 port_num;
unsigned long vl;
- u32 vl_select_mask;
+ unsigned long vl_select_mask;
int vfi;
u16 link_width;
u16 link_speed;
@@ -3014,8 +3009,7 @@
}
/* Sanity check */
- response_data_size = sizeof(struct opa_port_data_counters_msg) +
- num_vls * sizeof(struct _vls_dctrs);
+ response_data_size = struct_size(req, port[0].vls, num_vls);
if (response_data_size > sizeof(pmp->data)) {
pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
@@ -3073,8 +3067,7 @@
* So in the for_each_set_bit() loop below, we don't need
* any additional checks for vl.
*/
- for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
- 8 * sizeof(req->vl_select_mask)) {
+ for_each_set_bit(vl, &vl_select_mask, BITS_PER_LONG) {
memset(vlinfo, 0, sizeof(*vlinfo));
rsp->vls[vfi].port_vl_xmit_data =
@@ -3122,7 +3115,7 @@
vfi++;
}
- a0_datacounters(ppd, rsp, vl_select_mask);
+ a0_datacounters(ppd, rsp);
if (resp_len)
*resp_len += response_data_size;
@@ -3217,7 +3210,7 @@
struct _vls_ectrs *vlinfo;
unsigned long vl;
u64 port_mask, tmp;
- u32 vl_select_mask;
+ unsigned long vl_select_mask;
int vfi;
req = (struct opa_port_error_counters64_msg *)pmp->data;
@@ -3232,8 +3225,7 @@
return reply((struct ib_mad_hdr *)pmp);
}
- response_data_size = sizeof(struct opa_port_error_counters64_msg) +
- num_vls * sizeof(struct _vls_ectrs);
+ response_data_size = struct_size(req, port[0].vls, num_vls);
if (response_data_size > sizeof(pmp->data)) {
pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
@@ -3276,8 +3268,7 @@
vlinfo = &rsp->vls[0];
vfi = 0;
vl_select_mask = be32_to_cpu(req->vl_select_mask);
- for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
- 8 * sizeof(req->vl_select_mask)) {
+ for_each_set_bit(vl, &vl_select_mask, BITS_PER_LONG) {
memset(vlinfo, 0, sizeof(*vlinfo));
rsp->vls[vfi].port_vl_xmit_discards =
cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD_VL,
@@ -3488,7 +3479,7 @@
u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24;
u64 portn = be64_to_cpu(req->port_select_mask[3]);
u32 counter_select = be32_to_cpu(req->counter_select_mask);
- u32 vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */
+ unsigned long vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */
unsigned long vl;
if ((nports != 1) || (portn != 1 << port)) {
@@ -3582,8 +3573,7 @@
if (counter_select & CS_UNCORRECTABLE_ERRORS)
write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0);
- for_each_set_bit(vl, (unsigned long *)&(vl_select_mask),
- 8 * sizeof(vl_select_mask)) {
+ for_each_set_bit(vl, &vl_select_mask, BITS_PER_LONG) {
if (counter_select & CS_PORT_XMIT_DATA)
write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0);
@@ -4836,7 +4826,7 @@
int ret;
int pkey_idx;
int local_mad = 0;
- u32 resp_len = 0;
+ u32 resp_len = in_wc->byte_len - sizeof(*in_grh);
struct hfi1_ibport *ibp = to_iport(ibdev, port);
pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY);
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
index e1c7996..14d2a90 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -68,8 +68,7 @@
static unsigned long mmu_node_start(struct mmu_rb_node *);
static unsigned long mmu_node_last(struct mmu_rb_node *);
static int mmu_notifier_range_start(struct mmu_notifier *,
- struct mm_struct *,
- unsigned long, unsigned long, bool);
+ const struct mmu_notifier_range *);
static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
unsigned long, unsigned long);
static void do_remove(struct mmu_rb_handler *handler,
@@ -77,7 +76,6 @@
static void handle_remove(struct work_struct *work);
static const struct mmu_notifier_ops mn_opts = {
- .flags = MMU_INVALIDATE_DOES_NOT_BLOCK,
.invalidate_range_start = mmu_notifier_range_start,
};
@@ -285,10 +283,7 @@
}
static int mmu_notifier_range_start(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- bool blockable)
+ const struct mmu_notifier_range *range)
{
struct mmu_rb_handler *handler =
container_of(mn, struct mmu_rb_handler, mn);
@@ -298,10 +293,11 @@
bool added = false;
spin_lock_irqsave(&handler->lock, flags);
- for (node = __mmu_int_rb_iter_first(root, start, end - 1);
+ for (node = __mmu_int_rb_iter_first(root, range->start, range->end-1);
node; node = ptr) {
/* Guard against node removal. */
- ptr = __mmu_int_rb_iter_next(node, start, end - 1);
+ ptr = __mmu_int_rb_iter_next(node, range->start,
+ range->end - 1);
trace_hfi1_mmu_mem_invalidate(node->addr, node->len);
if (handler->ops->invalidate(handler->ops_arg, node)) {
__mmu_int_rb_remove(node, root);
diff --git a/drivers/infiniband/hw/hfi1/msix.c b/drivers/infiniband/hw/hfi1/msix.c
new file mode 100644
index 0000000..d920b16
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/msix.c
@@ -0,0 +1,363 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "hfi.h"
+#include "affinity.h"
+#include "sdma.h"
+
+/**
+ * msix_initialize() - Calculate, request and configure MSIx IRQs
+ * @dd: valid hfi1 devdata
+ *
+ */
+int msix_initialize(struct hfi1_devdata *dd)
+{
+ u32 total;
+ int ret;
+ struct hfi1_msix_entry *entries;
+
+ /*
+ * MSIx interrupt count:
+ * one for the general, "slow path" interrupt
+ * one per used SDMA engine
+ * one per kernel receive context
+ * one for each VNIC context
+ * ...any new IRQs should be added here.
+ */
+ total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts;
+
+ if (total >= CCE_NUM_MSIX_VECTORS)
+ return -EINVAL;
+
+ ret = pci_alloc_irq_vectors(dd->pcidev, total, total, PCI_IRQ_MSIX);
+ if (ret < 0) {
+ dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", ret);
+ return ret;
+ }
+
+ entries = kcalloc(total, sizeof(*dd->msix_info.msix_entries),
+ GFP_KERNEL);
+ if (!entries) {
+ pci_free_irq_vectors(dd->pcidev);
+ return -ENOMEM;
+ }
+
+ dd->msix_info.msix_entries = entries;
+ spin_lock_init(&dd->msix_info.msix_lock);
+ bitmap_zero(dd->msix_info.in_use_msix, total);
+ dd->msix_info.max_requested = total;
+ dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
+
+ return 0;
+}
+
+/**
+ * msix_request_irq() - Allocate a free MSIx IRQ
+ * @dd: valid devdata
+ * @arg: context information for the IRQ
+ * @handler: IRQ handler
+ * @thread: IRQ thread handler (could be NULL)
+ * @idx: zero base idx if multiple devices are needed
+ * @type: affinty IRQ type
+ *
+ * Allocated an MSIx vector if available, and then create the appropriate
+ * meta data needed to keep track of the pci IRQ request.
+ *
+ * Return:
+ * < 0 Error
+ * >= 0 MSIx vector
+ *
+ */
+static int msix_request_irq(struct hfi1_devdata *dd, void *arg,
+ irq_handler_t handler, irq_handler_t thread,
+ u32 idx, enum irq_type type)
+{
+ unsigned long nr;
+ int irq;
+ int ret;
+ const char *err_info;
+ char name[MAX_NAME_SIZE];
+ struct hfi1_msix_entry *me;
+
+ /* Allocate an MSIx vector */
+ spin_lock(&dd->msix_info.msix_lock);
+ nr = find_first_zero_bit(dd->msix_info.in_use_msix,
+ dd->msix_info.max_requested);
+ if (nr < dd->msix_info.max_requested)
+ __set_bit(nr, dd->msix_info.in_use_msix);
+ spin_unlock(&dd->msix_info.msix_lock);
+
+ if (nr == dd->msix_info.max_requested)
+ return -ENOSPC;
+
+ /* Specific verification and determine the name */
+ switch (type) {
+ case IRQ_GENERAL:
+ /* general interrupt must be MSIx vector 0 */
+ if (nr) {
+ spin_lock(&dd->msix_info.msix_lock);
+ __clear_bit(nr, dd->msix_info.in_use_msix);
+ spin_unlock(&dd->msix_info.msix_lock);
+ dd_dev_err(dd, "Invalid index %lu for GENERAL IRQ\n",
+ nr);
+ return -EINVAL;
+ }
+ snprintf(name, sizeof(name), DRIVER_NAME "_%d", dd->unit);
+ err_info = "general";
+ break;
+ case IRQ_SDMA:
+ snprintf(name, sizeof(name), DRIVER_NAME "_%d sdma%d",
+ dd->unit, idx);
+ err_info = "sdma";
+ break;
+ case IRQ_RCVCTXT:
+ snprintf(name, sizeof(name), DRIVER_NAME "_%d kctxt%d",
+ dd->unit, idx);
+ err_info = "receive context";
+ break;
+ case IRQ_OTHER:
+ default:
+ return -EINVAL;
+ }
+ name[sizeof(name) - 1] = 0;
+
+ irq = pci_irq_vector(dd->pcidev, nr);
+ ret = pci_request_irq(dd->pcidev, nr, handler, thread, arg, name);
+ if (ret) {
+ dd_dev_err(dd,
+ "%s: request for IRQ %d failed, MSIx %d, err %d\n",
+ err_info, irq, idx, ret);
+ spin_lock(&dd->msix_info.msix_lock);
+ __clear_bit(nr, dd->msix_info.in_use_msix);
+ spin_unlock(&dd->msix_info.msix_lock);
+ return ret;
+ }
+
+ /*
+ * assign arg after pci_request_irq call, so it will be
+ * cleaned up
+ */
+ me = &dd->msix_info.msix_entries[nr];
+ me->irq = irq;
+ me->arg = arg;
+ me->type = type;
+
+ /* This is a request, so a failure is not fatal */
+ ret = hfi1_get_irq_affinity(dd, me);
+ if (ret)
+ dd_dev_err(dd, "unable to pin IRQ %d\n", ret);
+
+ return nr;
+}
+
+/**
+ * msix_request_rcd_irq() - Helper function for RCVAVAIL IRQs
+ * @rcd: valid rcd context
+ *
+ */
+int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd)
+{
+ int nr;
+
+ nr = msix_request_irq(rcd->dd, rcd, receive_context_interrupt,
+ receive_context_thread, rcd->ctxt, IRQ_RCVCTXT);
+ if (nr < 0)
+ return nr;
+
+ /*
+ * Set the interrupt register and mask for this
+ * context's interrupt.
+ */
+ rcd->ireg = (IS_RCVAVAIL_START + rcd->ctxt) / 64;
+ rcd->imask = ((u64)1) << ((IS_RCVAVAIL_START + rcd->ctxt) % 64);
+ rcd->msix_intr = nr;
+ remap_intr(rcd->dd, IS_RCVAVAIL_START + rcd->ctxt, nr);
+
+ return 0;
+}
+
+/**
+ * msix_request_smda_ira() - Helper for getting SDMA IRQ resources
+ * @sde: valid sdma engine
+ *
+ */
+int msix_request_sdma_irq(struct sdma_engine *sde)
+{
+ int nr;
+
+ nr = msix_request_irq(sde->dd, sde, sdma_interrupt, NULL,
+ sde->this_idx, IRQ_SDMA);
+ if (nr < 0)
+ return nr;
+ sde->msix_intr = nr;
+ remap_sdma_interrupts(sde->dd, sde->this_idx, nr);
+
+ return 0;
+}
+
+/**
+ * enable_sdma_src() - Helper to enable SDMA IRQ srcs
+ * @dd: valid devdata structure
+ * @i: index of SDMA engine
+ */
+static void enable_sdma_srcs(struct hfi1_devdata *dd, int i)
+{
+ set_intr_bits(dd, IS_SDMA_START + i, IS_SDMA_START + i, true);
+ set_intr_bits(dd, IS_SDMA_PROGRESS_START + i,
+ IS_SDMA_PROGRESS_START + i, true);
+ set_intr_bits(dd, IS_SDMA_IDLE_START + i, IS_SDMA_IDLE_START + i, true);
+ set_intr_bits(dd, IS_SDMAENG_ERR_START + i, IS_SDMAENG_ERR_START + i,
+ true);
+}
+
+/**
+ * msix_request_irqs() - Allocate all MSIx IRQs
+ * @dd: valid devdata structure
+ *
+ * Helper function to request the used MSIx IRQs.
+ *
+ */
+int msix_request_irqs(struct hfi1_devdata *dd)
+{
+ int i;
+ int ret;
+
+ ret = msix_request_irq(dd, dd, general_interrupt, NULL, 0, IRQ_GENERAL);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < dd->num_sdma; i++) {
+ struct sdma_engine *sde = &dd->per_sdma[i];
+
+ ret = msix_request_sdma_irq(sde);
+ if (ret)
+ return ret;
+ enable_sdma_srcs(sde->dd, i);
+ }
+
+ for (i = 0; i < dd->n_krcv_queues; i++) {
+ struct hfi1_ctxtdata *rcd = hfi1_rcd_get_by_index_safe(dd, i);
+
+ if (rcd)
+ ret = msix_request_rcd_irq(rcd);
+ hfi1_rcd_put(rcd);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/**
+ * msix_free_irq() - Free the specified MSIx resources and IRQ
+ * @dd: valid devdata
+ * @msix_intr: MSIx vector to free.
+ *
+ */
+void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr)
+{
+ struct hfi1_msix_entry *me;
+
+ if (msix_intr >= dd->msix_info.max_requested)
+ return;
+
+ me = &dd->msix_info.msix_entries[msix_intr];
+
+ if (!me->arg) /* => no irq, no affinity */
+ return;
+
+ hfi1_put_irq_affinity(dd, me);
+ pci_free_irq(dd->pcidev, msix_intr, me->arg);
+
+ me->arg = NULL;
+
+ spin_lock(&dd->msix_info.msix_lock);
+ __clear_bit(msix_intr, dd->msix_info.in_use_msix);
+ spin_unlock(&dd->msix_info.msix_lock);
+}
+
+/**
+ * hfi1_clean_up_msix_interrupts() - Free all MSIx IRQ resources
+ * @dd: valid device data data structure
+ *
+ * Free the MSIx and associated PCI resources, if they have been allocated.
+ */
+void msix_clean_up_interrupts(struct hfi1_devdata *dd)
+{
+ int i;
+ struct hfi1_msix_entry *me = dd->msix_info.msix_entries;
+
+ /* remove irqs - must happen before disabling/turning off */
+ for (i = 0; i < dd->msix_info.max_requested; i++, me++)
+ msix_free_irq(dd, i);
+
+ /* clean structures */
+ kfree(dd->msix_info.msix_entries);
+ dd->msix_info.msix_entries = NULL;
+ dd->msix_info.max_requested = 0;
+
+ pci_free_irq_vectors(dd->pcidev);
+}
+
+/**
+ * msix_vnic_syncrhonize_irq() - Vnic IRQ synchronize
+ * @dd: valid devdata
+ */
+void msix_vnic_synchronize_irq(struct hfi1_devdata *dd)
+{
+ int i;
+
+ for (i = 0; i < dd->vnic.num_ctxt; i++) {
+ struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i];
+ struct hfi1_msix_entry *me;
+
+ me = &dd->msix_info.msix_entries[rcd->msix_intr];
+
+ synchronize_irq(me->irq);
+ }
+}
diff --git a/drivers/infiniband/hw/hfi1/msix.h b/drivers/infiniband/hw/hfi1/msix.h
new file mode 100644
index 0000000..a514881
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/msix.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ * This file is provided under a dual BSD/GPLv2 license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * BSD LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * - Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+#ifndef _HFI1_MSIX_H
+#define _HFI1_MSIX_H
+
+#include "hfi.h"
+
+/* MSIx interface */
+int msix_initialize(struct hfi1_devdata *dd);
+int msix_request_irqs(struct hfi1_devdata *dd);
+void msix_clean_up_interrupts(struct hfi1_devdata *dd);
+int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd);
+int msix_request_sdma_irq(struct sdma_engine *sde);
+void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr);
+
+/* VNIC interface */
+void msix_vnic_synchronize_irq(struct hfi1_devdata *dd);
+
+#endif
diff --git a/drivers/infiniband/hw/hfi1/opfn.c b/drivers/infiniband/hw/hfi1/opfn.c
new file mode 100644
index 0000000..370a5a8
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/opfn.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#include "hfi.h"
+#include "trace.h"
+#include "qp.h"
+#include "opfn.h"
+
+#define IB_BTHE_E BIT(IB_BTHE_E_SHIFT)
+
+#define OPFN_CODE(code) BIT((code) - 1)
+#define OPFN_MASK(code) OPFN_CODE(STL_VERBS_EXTD_##code)
+
+struct hfi1_opfn_type {
+ bool (*request)(struct rvt_qp *qp, u64 *data);
+ bool (*response)(struct rvt_qp *qp, u64 *data);
+ bool (*reply)(struct rvt_qp *qp, u64 data);
+ void (*error)(struct rvt_qp *qp);
+};
+
+static struct hfi1_opfn_type hfi1_opfn_handlers[STL_VERBS_EXTD_MAX] = {
+ [STL_VERBS_EXTD_TID_RDMA] = {
+ .request = tid_rdma_conn_req,
+ .response = tid_rdma_conn_resp,
+ .reply = tid_rdma_conn_reply,
+ .error = tid_rdma_conn_error,
+ },
+};
+
+static struct workqueue_struct *opfn_wq;
+
+static void opfn_schedule_conn_request(struct rvt_qp *qp);
+
+static bool hfi1_opfn_extended(u32 bth1)
+{
+ return !!(bth1 & IB_BTHE_E);
+}
+
+static void opfn_conn_request(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct ib_atomic_wr wr;
+ u16 mask, capcode;
+ struct hfi1_opfn_type *extd;
+ u64 data;
+ unsigned long flags;
+ int ret = 0;
+
+ trace_hfi1_opfn_state_conn_request(qp);
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ /*
+ * Exit if the extended bit is not set, or if nothing is requested, or
+ * if we have completed all requests, or if a previous request is in
+ * progress
+ */
+ if (!priv->opfn.extended || !priv->opfn.requested ||
+ priv->opfn.requested == priv->opfn.completed || priv->opfn.curr)
+ goto done;
+
+ mask = priv->opfn.requested & ~priv->opfn.completed;
+ capcode = ilog2(mask & ~(mask - 1)) + 1;
+ if (capcode >= STL_VERBS_EXTD_MAX) {
+ priv->opfn.completed |= OPFN_CODE(capcode);
+ goto done;
+ }
+
+ extd = &hfi1_opfn_handlers[capcode];
+ if (!extd || !extd->request || !extd->request(qp, &data)) {
+ /*
+ * Either there is no handler for this capability or the request
+ * packet could not be generated. Either way, mark it as done so
+ * we don't keep attempting to complete it.
+ */
+ priv->opfn.completed |= OPFN_CODE(capcode);
+ goto done;
+ }
+
+ trace_hfi1_opfn_data_conn_request(qp, capcode, data);
+ data = (data & ~0xf) | capcode;
+
+ memset(&wr, 0, sizeof(wr));
+ wr.wr.opcode = IB_WR_OPFN;
+ wr.remote_addr = HFI1_VERBS_E_ATOMIC_VADDR;
+ wr.compare_add = data;
+
+ priv->opfn.curr = capcode; /* A new request is now in progress */
+ /* Drop opfn.lock before calling ib_post_send() */
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+
+ ret = ib_post_send(&qp->ibqp, &wr.wr, NULL);
+ if (ret)
+ goto err;
+ trace_hfi1_opfn_state_conn_request(qp);
+ return;
+err:
+ trace_hfi1_msg_opfn_conn_request(qp, "ib_ost_send failed: ret = ",
+ (u64)ret);
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ /*
+ * In case of an unexpected error return from ib_post_send
+ * clear opfn.curr and reschedule to try again
+ */
+ priv->opfn.curr = STL_VERBS_EXTD_NONE;
+ opfn_schedule_conn_request(qp);
+done:
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_send_conn_request(struct work_struct *work)
+{
+ struct hfi1_opfn_data *od;
+ struct hfi1_qp_priv *qpriv;
+
+ od = container_of(work, struct hfi1_opfn_data, opfn_work);
+ qpriv = container_of(od, struct hfi1_qp_priv, opfn);
+
+ opfn_conn_request(qpriv->owner);
+}
+
+/*
+ * When QP s_lock is held in the caller, the OPFN request must be scheduled
+ * to a different workqueue to avoid double locking QP s_lock in call to
+ * ib_post_send in opfn_conn_request
+ */
+static void opfn_schedule_conn_request(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ trace_hfi1_opfn_state_sched_conn_request(qp);
+ queue_work(opfn_wq, &priv->opfn.opfn_work);
+}
+
+void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_atomic_eth *ateth)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ u64 data = be64_to_cpu(ateth->compare_data);
+ struct hfi1_opfn_type *extd;
+ u8 capcode;
+ unsigned long flags;
+
+ trace_hfi1_opfn_state_conn_response(qp);
+ capcode = data & 0xf;
+ trace_hfi1_opfn_data_conn_response(qp, capcode, data);
+ if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
+ return;
+
+ extd = &hfi1_opfn_handlers[capcode];
+
+ if (!extd || !extd->response) {
+ e->atomic_data = capcode;
+ return;
+ }
+
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ if (priv->opfn.completed & OPFN_CODE(capcode)) {
+ /*
+ * We are receiving a request for a feature that has already
+ * been negotiated. This may mean that the other side has reset
+ */
+ priv->opfn.completed &= ~OPFN_CODE(capcode);
+ if (extd->error)
+ extd->error(qp);
+ }
+
+ if (extd->response(qp, &data))
+ priv->opfn.completed |= OPFN_CODE(capcode);
+ e->atomic_data = (data & ~0xf) | capcode;
+ trace_hfi1_opfn_state_conn_response(qp);
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_conn_reply(struct rvt_qp *qp, u64 data)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_opfn_type *extd;
+ u8 capcode;
+ unsigned long flags;
+
+ trace_hfi1_opfn_state_conn_reply(qp);
+ capcode = data & 0xf;
+ trace_hfi1_opfn_data_conn_reply(qp, capcode, data);
+ if (!capcode || capcode >= STL_VERBS_EXTD_MAX)
+ return;
+
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ /*
+ * Either there is no previous request or the reply is not for the
+ * current request
+ */
+ if (!priv->opfn.curr || capcode != priv->opfn.curr)
+ goto done;
+
+ extd = &hfi1_opfn_handlers[capcode];
+
+ if (!extd || !extd->reply)
+ goto clear;
+
+ if (extd->reply(qp, data))
+ priv->opfn.completed |= OPFN_CODE(capcode);
+clear:
+ /*
+ * Clear opfn.curr to indicate that the previous request is no longer in
+ * progress
+ */
+ priv->opfn.curr = STL_VERBS_EXTD_NONE;
+ trace_hfi1_opfn_state_conn_reply(qp);
+done:
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_conn_error(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_opfn_type *extd = NULL;
+ unsigned long flags;
+ u16 capcode;
+
+ trace_hfi1_opfn_state_conn_error(qp);
+ trace_hfi1_msg_opfn_conn_error(qp, "error. qp state ", (u64)qp->state);
+ /*
+ * The QP has gone into the Error state. We have to invalidate all
+ * negotiated feature, including the one in progress (if any). The RC
+ * QP handling will clean the WQE for the connection request.
+ */
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ while (priv->opfn.completed) {
+ capcode = priv->opfn.completed & ~(priv->opfn.completed - 1);
+ extd = &hfi1_opfn_handlers[ilog2(capcode) + 1];
+ if (extd->error)
+ extd->error(qp);
+ priv->opfn.completed &= ~OPFN_CODE(capcode);
+ }
+ priv->opfn.extended = 0;
+ priv->opfn.requested = 0;
+ priv->opfn.curr = STL_VERBS_EXTD_NONE;
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask)
+{
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ unsigned long flags;
+
+ if (attr_mask & IB_QP_RETRY_CNT)
+ priv->s_retry = attr->retry_cnt;
+
+ spin_lock_irqsave(&priv->opfn.lock, flags);
+ if (ibqp->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+ struct tid_rdma_params *local = &priv->tid_rdma.local;
+
+ if (attr_mask & IB_QP_TIMEOUT)
+ priv->tid_retry_timeout_jiffies = qp->timeout_jiffies;
+ if (qp->pmtu == enum_to_mtu(OPA_MTU_4096) ||
+ qp->pmtu == enum_to_mtu(OPA_MTU_8192)) {
+ tid_rdma_opfn_init(qp, local);
+ /*
+ * We only want to set the OPFN requested bit when the
+ * QP transitions to RTS.
+ */
+ if (attr_mask & IB_QP_STATE &&
+ attr->qp_state == IB_QPS_RTS) {
+ priv->opfn.requested |= OPFN_MASK(TID_RDMA);
+ /*
+ * If the QP is transitioning to RTS and the
+ * opfn.completed for TID RDMA has already been
+ * set, the QP is being moved *back* into RTS.
+ * We can now renegotiate the TID RDMA
+ * parameters.
+ */
+ if (priv->opfn.completed &
+ OPFN_MASK(TID_RDMA)) {
+ priv->opfn.completed &=
+ ~OPFN_MASK(TID_RDMA);
+ /*
+ * Since the opfn.completed bit was
+ * already set, it is safe to assume
+ * that the opfn.extended is also set.
+ */
+ opfn_schedule_conn_request(qp);
+ }
+ }
+ } else {
+ memset(local, 0, sizeof(*local));
+ }
+ }
+ spin_unlock_irqrestore(&priv->opfn.lock, flags);
+}
+
+void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (!priv->opfn.extended && hfi1_opfn_extended(bth1) &&
+ HFI1_CAP_IS_KSET(OPFN)) {
+ priv->opfn.extended = 1;
+ if (qp->state == IB_QPS_RTS)
+ opfn_conn_request(qp);
+ }
+}
+
+int opfn_init(void)
+{
+ opfn_wq = alloc_workqueue("hfi_opfn",
+ WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
+ WQ_MEM_RECLAIM,
+ HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES);
+ if (!opfn_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void opfn_exit(void)
+{
+ if (opfn_wq) {
+ destroy_workqueue(opfn_wq);
+ opfn_wq = NULL;
+ }
+}
diff --git a/drivers/infiniband/hw/hfi1/opfn.h b/drivers/infiniband/hw/hfi1/opfn.h
new file mode 100644
index 0000000..62f93c1
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/opfn.h
@@ -0,0 +1,87 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#ifndef _HFI1_OPFN_H
+#define _HFI1_OPFN_H
+
+/**
+ * DOC: Omni Path Feature Negotion (OPFN)
+ *
+ * OPFN is a discovery protocol for Intel Omni-Path fabric that
+ * allows two RC QPs to negotiate a common feature that both QPs
+ * can support. Currently, the only OPA feature that OPFN
+ * supports is TID RDMA.
+ *
+ * Architecture
+ *
+ * OPFN involves the communication between two QPs on the HFI
+ * level on an Omni-Path fabric, and ULPs have no knowledge of
+ * OPFN at all.
+ *
+ * Implementation
+ *
+ * OPFN extends the existing IB RC protocol with the following
+ * changes:
+ * -- Uses Bit 24 (reserved) of DWORD 1 of Base Transport
+ * Header (BTH1) to indicate that the RC QP supports OPFN;
+ * -- Uses a combination of RC COMPARE_SWAP opcode (0x13) and
+ * the address U64_MAX (0xFFFFFFFFFFFFFFFF) as an OPFN
+ * request; The 64-bit data carried with the request/response
+ * contains the parameters for negotiation and will be
+ * defined in tid_rdma.c file;
+ * -- Defines IB_WR_RESERVED3 as IB_WR_OPFN.
+ *
+ * The OPFN communication will be triggered when an RC QP
+ * receives a request with Bit 24 of BTH1 set. The responder QP
+ * will then post send an OPFN request with its local
+ * parameters, which will be sent to the requester QP once all
+ * existing requests on the responder QP side have been sent.
+ * Once the requester QP receives the OPFN request, it will
+ * keep a copy of the responder QP's parameters, and return a
+ * response packet with its own local parameters. The responder
+ * QP receives the response packet and keeps a copy of the requester
+ * QP's parameters. After this exchange, each side has the parameters
+ * for both sides and therefore can select the right parameters
+ * for future transactions
+ */
+
+#include <linux/workqueue.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdmavt_qp.h>
+
+/* STL Verbs Extended */
+#define IB_BTHE_E_SHIFT 24
+#define HFI1_VERBS_E_ATOMIC_VADDR U64_MAX
+
+enum hfi1_opfn_codes {
+ STL_VERBS_EXTD_NONE = 0,
+ STL_VERBS_EXTD_TID_RDMA,
+ STL_VERBS_EXTD_MAX
+};
+
+struct hfi1_opfn_data {
+ u8 extended;
+ u16 requested;
+ u16 completed;
+ enum hfi1_opfn_codes curr;
+ /* serialize opfn function calls */
+ spinlock_t lock;
+ struct work_struct opfn_work;
+};
+
+/* WR opcode for OPFN */
+#define IB_WR_OPFN IB_WR_RESERVED3
+
+void opfn_send_conn_request(struct work_struct *work);
+void opfn_conn_response(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_atomic_eth *ateth);
+void opfn_conn_reply(struct rvt_qp *qp, u64 data);
+void opfn_conn_error(struct rvt_qp *qp);
+void opfn_qp_init(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask);
+void opfn_trigger_conn_request(struct rvt_qp *qp, u32 bth1);
+int opfn_init(void);
+void opfn_exit(void);
+
+#endif /* _HFI1_OPFN_H */
diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
index 6c967dd..61362bd 100644
--- a/drivers/infiniband/hw/hfi1/pcie.c
+++ b/drivers/infiniband/hw/hfi1/pcie.c
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2019 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -61,19 +61,12 @@
*/
/*
- * Code to adjust PCIe capabilities.
- */
-static void tune_pcie_caps(struct hfi1_devdata *);
-
-/*
* Do all the common PCIe setup and initialization.
- * devdata is not yet allocated, and is not allocated until after this
- * routine returns success. Therefore dd_dev_err() can't be used for error
- * printing.
*/
-int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
+int hfi1_pcie_init(struct hfi1_devdata *dd)
{
int ret;
+ struct pci_dev *pdev = dd->pcidev;
ret = pci_enable_device(pdev);
if (ret) {
@@ -89,15 +82,13 @@
* about that, it appears. If the original BAR was retained
* in the kernel data structures, this may be OK.
*/
- hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n",
- -ret);
- goto done;
+ dd_dev_err(dd, "pci enable failed: error %d\n", -ret);
+ return ret;
}
ret = pci_request_regions(pdev, DRIVER_NAME);
if (ret) {
- hfi1_early_err(&pdev->dev,
- "pci_request_regions fails: err %d\n", -ret);
+ dd_dev_err(dd, "pci_request_regions fails: err %d\n", -ret);
goto bail;
}
@@ -110,8 +101,7 @@
*/
ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
if (ret) {
- hfi1_early_err(&pdev->dev,
- "Unable to set DMA mask: %d\n", ret);
+ dd_dev_err(dd, "Unable to set DMA mask: %d\n", ret);
goto bail;
}
ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
@@ -119,18 +109,16 @@
ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
}
if (ret) {
- hfi1_early_err(&pdev->dev,
- "Unable to set DMA consistent mask: %d\n", ret);
+ dd_dev_err(dd, "Unable to set DMA consistent mask: %d\n", ret);
goto bail;
}
pci_set_master(pdev);
(void)pci_enable_pcie_error_reporting(pdev);
- goto done;
+ return 0;
bail:
hfi1_pcie_cleanup(pdev);
-done:
return ret;
}
@@ -206,7 +194,7 @@
dd_dev_err(dd, "WC mapping of send buffers failed\n");
goto nomem;
}
- dd_dev_info(dd, "WC piobase: %p\n for %x", dd->piobase, TXE_PIO_SIZE);
+ dd_dev_info(dd, "WC piobase: %p for %x\n", dd->piobase, TXE_PIO_SIZE);
dd->physaddr = addr; /* used for io_remap, etc. */
@@ -331,7 +319,9 @@
/*
* bus->max_bus_speed is set from the bridge's linkcap Max Link Speed
*/
- if (parent && dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) {
+ if (parent &&
+ (dd->pcidev->bus->max_bus_speed == PCIE_SPEED_2_5GT ||
+ dd->pcidev->bus->max_bus_speed == PCIE_SPEED_5_0GT)) {
dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n");
dd->link_gen3_capable = 0;
}
@@ -344,26 +334,6 @@
return 0;
}
-/*
- * Returns:
- * - actual number of interrupts allocated or
- * - error
- */
-int request_msix(struct hfi1_devdata *dd, u32 msireq)
-{
- int nvec;
-
- nvec = pci_alloc_irq_vectors(dd->pcidev, msireq, msireq, PCI_IRQ_MSIX);
- if (nvec < 0) {
- dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", nvec);
- return nvec;
- }
-
- tune_pcie_caps(dd);
-
- return nvec;
-}
-
/* restore command and BARs after a reset has wiped them out */
int restore_pci_variables(struct hfi1_devdata *dd)
{
@@ -479,14 +449,15 @@
* Check and optionally adjust them to maximize our throughput.
*/
static int hfi1_pcie_caps;
-module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO);
+module_param_named(pcie_caps, hfi1_pcie_caps, int, 0444);
MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
-uint aspm_mode = ASPM_MODE_DISABLED;
-module_param_named(aspm, aspm_mode, uint, S_IRUGO);
-MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic");
-
-static void tune_pcie_caps(struct hfi1_devdata *dd)
+/**
+ * tune_pcie_caps() - Code to adjust PCIe capabilities.
+ * @dd: Valid device data structure
+ *
+ */
+void tune_pcie_caps(struct hfi1_devdata *dd)
{
struct pci_dev *parent;
u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
@@ -650,7 +621,6 @@
struct hfi1_devdata *dd = pci_get_drvdata(pdev);
dd_dev_info(dd, "HFI1 resume function called\n");
- pci_cleanup_aer_uncorrect_error_status(pdev);
/*
* Running jobs will fail, since it's asynchronous
* unlike sysfs-requested reset. Better than
@@ -1029,6 +999,7 @@
const u8 (*ctle_tunings)[4];
uint static_ctle_mode;
int return_error = 0;
+ u32 target_width;
/* PCIe Gen3 is for the ASIC only */
if (dd->icode != ICODE_RTL_SILICON)
@@ -1068,6 +1039,9 @@
return 0;
}
+ /* Previous Gen1/Gen2 bus width */
+ target_width = dd->lbus_width;
+
/*
* Do the Gen3 transition. Steps are those of the PCIe Gen3
* recipe.
@@ -1436,11 +1410,12 @@
dd_dev_info(dd, "%s: new speed and width: %s\n", __func__,
dd->lbus_info);
- if (dd->lbus_speed != target_speed) { /* not target */
+ if (dd->lbus_speed != target_speed ||
+ dd->lbus_width < target_width) { /* not target */
/* maybe retry */
do_retry = retry_count < pcie_retry;
- dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n",
- pcie_target, do_retry ? ", retrying" : "");
+ dd_dev_err(dd, "PCIe link speed or width did not match target%s\n",
+ do_retry ? ", retrying" : "");
retry_count++;
if (do_retry) {
msleep(100); /* allow time to settle */
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index 7520576..79126b2 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -71,14 +71,6 @@
}
}
-/* defined in header release 48 and higher */
-#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT
-#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3
-#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull
-#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \
- << SEND_CTRL_UNSUPPORTED_VL_SHIFT)
-#endif
-
/* global control of PIO send */
void pio_send_control(struct hfi1_devdata *dd, int op)
{
@@ -750,6 +742,7 @@
spin_lock_init(&sc->alloc_lock);
spin_lock_init(&sc->release_lock);
spin_lock_init(&sc->credit_ctrl_lock);
+ seqlock_init(&sc->waitlock);
INIT_LIST_HEAD(&sc->piowait);
INIT_WORK(&sc->halt_work, sc_halted);
init_waitqueue_head(&sc->halt_wait);
@@ -959,6 +952,22 @@
}
}
spin_unlock(&sc->release_lock);
+
+ write_seqlock(&sc->waitlock);
+ while (!list_empty(&sc->piowait)) {
+ struct iowait *wait;
+ struct rvt_qp *qp;
+ struct hfi1_qp_priv *priv;
+
+ wait = list_first_entry(&sc->piowait, struct iowait, list);
+ qp = iowait_to_qp(wait);
+ priv = qp->priv;
+ list_del_init(&priv->s_iowait.list);
+ priv->s_iowait.lock = NULL;
+ hfi1_qp_wakeup(qp, RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
+ }
+ write_sequnlock(&sc->waitlock);
+
spin_unlock_irq(&sc->alloc_lock);
}
@@ -1434,7 +1443,8 @@
* @cb: optional callback to call when the buffer is finished sending
* @arg: argument for cb
*
- * Return a pointer to a PIO buffer if successful, NULL if not enough room.
+ * Return a pointer to a PIO buffer, NULL if not enough room, -ECOMM
+ * when link is down.
*/
struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len,
pio_release_cb cb, void *arg)
@@ -1450,7 +1460,7 @@
spin_lock_irqsave(&sc->alloc_lock, flags);
if (!(sc->flags & SCF_ENABLED)) {
spin_unlock_irqrestore(&sc->alloc_lock, flags);
- goto done;
+ return ERR_PTR(-ECOMM);
}
retry:
@@ -1584,10 +1594,8 @@
else
sc_del_credit_return_intr(sc);
trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl);
- if (needint) {
- mmiowb();
+ if (needint)
sc_return_credits(sc);
- }
}
/**
@@ -1601,14 +1609,12 @@
static void sc_piobufavail(struct send_context *sc)
{
struct hfi1_devdata *dd = sc->dd;
- struct hfi1_ibdev *dev = &dd->verbs_dev;
struct list_head *list;
struct rvt_qp *qps[PIO_WAIT_BATCH_SIZE];
struct rvt_qp *qp;
struct hfi1_qp_priv *priv;
unsigned long flags;
- uint i, n = 0, max_idx = 0;
- u8 max_starved_cnt = 0;
+ uint i, n = 0, top_idx = 0;
if (dd->send_contexts[sc->sw_index].type != SC_KERNEL &&
dd->send_contexts[sc->sw_index].type != SC_VL15)
@@ -1620,18 +1626,25 @@
* could end up with QPs on the wait list with the interrupt
* disabled.
*/
- write_seqlock_irqsave(&dev->iowait_lock, flags);
+ write_seqlock_irqsave(&sc->waitlock, flags);
while (!list_empty(list)) {
struct iowait *wait;
if (n == ARRAY_SIZE(qps))
break;
wait = list_first_entry(list, struct iowait, list);
+ iowait_get_priority(wait);
qp = iowait_to_qp(wait);
priv = qp->priv;
list_del_init(&priv->s_iowait.list);
priv->s_iowait.lock = NULL;
- iowait_starve_find_max(wait, &max_starved_cnt, n, &max_idx);
+ if (n) {
+ priv = qps[top_idx]->priv;
+ top_idx = iowait_priority_update_top(wait,
+ &priv->s_iowait,
+ n, top_idx);
+ }
+
/* refcount held until actual wake up */
qps[n++] = qp;
}
@@ -1644,14 +1657,14 @@
if (!list_empty(list))
hfi1_sc_wantpiobuf_intr(sc, 1);
}
- write_sequnlock_irqrestore(&dev->iowait_lock, flags);
+ write_sequnlock_irqrestore(&sc->waitlock, flags);
- /* Wake up the most starved one first */
+ /* Wake up the top-priority one first */
if (n)
- hfi1_qp_wakeup(qps[max_idx],
+ hfi1_qp_wakeup(qps[top_idx],
RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
for (i = 0; i < n; i++)
- if (i != max_idx)
+ if (i != top_idx)
hfi1_qp_wakeup(qps[i],
RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
}
@@ -2106,11 +2119,10 @@
int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return);
set_dev_node(&dd->pcidev->dev, i);
- dd->cr_base[i].va = dma_zalloc_coherent(
- &dd->pcidev->dev,
- bytes,
- &dd->cr_base[i].dma,
- GFP_KERNEL);
+ dd->cr_base[i].va = dma_alloc_coherent(&dd->pcidev->dev,
+ bytes,
+ &dd->cr_base[i].dma,
+ GFP_KERNEL);
if (!dd->cr_base[i].va) {
set_dev_node(&dd->pcidev->dev, dd->node);
dd_dev_err(dd,
@@ -2145,3 +2157,28 @@
kfree(dd->cr_base);
dd->cr_base = NULL;
}
+
+void seqfile_dump_sci(struct seq_file *s, u32 i,
+ struct send_context_info *sci)
+{
+ struct send_context *sc = sci->sc;
+ u64 reg;
+
+ seq_printf(s, "SCI %u: type %u base %u credits %u\n",
+ i, sci->type, sci->base, sci->credits);
+ seq_printf(s, " flags 0x%x sw_inx %u hw_ctxt %u grp %u\n",
+ sc->flags, sc->sw_index, sc->hw_context, sc->group);
+ seq_printf(s, " sr_size %u credits %u sr_head %u sr_tail %u\n",
+ sc->sr_size, sc->credits, sc->sr_head, sc->sr_tail);
+ seq_printf(s, " fill %lu free %lu fill_wrap %u alloc_free %lu\n",
+ sc->fill, sc->free, sc->fill_wrap, sc->alloc_free);
+ seq_printf(s, " credit_intr_count %u credit_ctrl 0x%llx\n",
+ sc->credit_intr_count, sc->credit_ctrl);
+ reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_STATUS));
+ seq_printf(s, " *hw_free %llu CurrentFree %llu LastReturned %llu\n",
+ (le64_to_cpu(*sc->hw_free) & CR_COUNTER_SMASK) >>
+ CR_COUNTER_SHIFT,
+ (reg >> SC(CREDIT_STATUS_CURRENT_FREE_COUNTER_SHIFT)) &
+ SC(CREDIT_STATUS_CURRENT_FREE_COUNTER_MASK),
+ reg & SC(CREDIT_STATUS_LAST_RETURNED_COUNTER_SMASK));
+}
diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h
index aaf372c..c9a58b6 100644
--- a/drivers/infiniband/hw/hfi1/pio.h
+++ b/drivers/infiniband/hw/hfi1/pio.h
@@ -127,6 +127,8 @@
volatile __le64 *hw_free; /* HW free counter */
/* list for PIO waiters */
struct list_head piowait ____cacheline_aligned_in_smp;
+ seqlock_t waitlock;
+
spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp;
u32 credit_intr_count; /* count of credit intr users */
u64 credit_ctrl; /* cache for credit control */
@@ -329,4 +331,7 @@
void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes);
void seg_pio_copy_end(struct pio_buf *pbuf);
+void seqfile_dump_sci(struct seq_file *s, u32 i,
+ struct send_context_info *sci);
+
#endif /* _PIO_H */
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 9b1e84a..f8e733a 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2019 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -66,7 +66,7 @@
static void flush_tx_list(struct rvt_qp *qp);
static int iowait_sleep(
struct sdma_engine *sde,
- struct iowait *wait,
+ struct iowait_work *wait,
struct sdma_txreq *stx,
unsigned int seq,
bool pkts_sent);
@@ -132,17 +132,27 @@
.qpt_support = BIT(IB_QPT_RC),
},
+[IB_WR_OPFN] = {
+ .length = sizeof(struct ib_atomic_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_USE_RESERVE,
+},
+
+[IB_WR_TID_RDMA_WRITE] = {
+ .length = sizeof(struct ib_rdma_wr),
+ .qpt_support = BIT(IB_QPT_RC),
+ .flags = RVT_OPERATION_IGN_RNR_CNT,
+},
+
};
-static void flush_tx_list(struct rvt_qp *qp)
+static void flush_list_head(struct list_head *l)
{
- struct hfi1_qp_priv *priv = qp->priv;
-
- while (!list_empty(&priv->s_iowait.tx_head)) {
+ while (!list_empty(l)) {
struct sdma_txreq *tx;
tx = list_first_entry(
- &priv->s_iowait.tx_head,
+ l,
struct sdma_txreq,
list);
list_del_init(&tx->list);
@@ -151,6 +161,14 @@
}
}
+static void flush_tx_list(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ flush_list_head(&iowait_get_ib_work(&priv->s_iowait)->tx_head);
+ flush_list_head(&iowait_get_tid_work(&priv->s_iowait)->tx_head);
+}
+
static void flush_iowait(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
@@ -279,41 +297,58 @@
priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc);
qp_set_16b(qp);
}
+
+ opfn_qp_init(qp, attr, attr_mask);
}
/**
- * hfi1_check_send_wqe - validate wqe
+ * hfi1_setup_wqe - set up the wqe
* @qp - The qp
* @wqe - The built wqe
+ * @call_send - Determine if the send should be posted or scheduled.
*
- * validate wqe. This is called
- * prior to inserting the wqe into
- * the ring but after the wqe has been
- * setup.
+ * Perform setup of the wqe. This is called
+ * prior to inserting the wqe into the ring but after
+ * the wqe has been setup by RDMAVT. This function
+ * allows the driver the opportunity to perform
+ * validation and additional setup of the wqe.
*
* Returns 0 on success, -EINVAL on failure
*
*/
-int hfi1_check_send_wqe(struct rvt_qp *qp,
- struct rvt_swqe *wqe)
+int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send)
{
struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
struct rvt_ah *ah;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_devdata *dd;
switch (qp->ibqp.qp_type) {
case IB_QPT_RC:
+ hfi1_setup_tid_rdma_wqe(qp, wqe);
+ /* fall through */
case IB_QPT_UC:
if (wqe->length > 0x80000000U)
return -EINVAL;
+ if (wqe->length > qp->pmtu)
+ *call_send = false;
break;
case IB_QPT_SMI:
- ah = ibah_to_rvtah(wqe->ud_wr.ah);
- if (wqe->length > (1 << ah->log_pmtu))
+ /*
+ * SM packets should exclusively use VL15 and their SL is
+ * ignored (IBTA v1.3, Section 3.5.8.2). Therefore, when ah
+ * is created, SL is 0 in most cases and as a result some
+ * fields (vl and pmtu) in ah may not be set correctly,
+ * depending on the SL2SC and SC2VL tables at the time.
+ */
+ ppd = ppd_from_ibp(ibp);
+ dd = dd_from_ppd(ppd);
+ if (wqe->length > dd->vld[15].mtu)
return -EINVAL;
break;
case IB_QPT_GSI:
case IB_QPT_UD:
- ah = ibah_to_rvtah(wqe->ud_wr.ah);
+ ah = rvt_get_swqe_ah(wqe);
if (wqe->length > (1 << ah->log_pmtu))
return -EINVAL;
if (ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)] == 0xf)
@@ -321,7 +356,14 @@
default:
break;
}
- return wqe->length <= piothreshold;
+
+ /*
+ * System latency between send and schedule is large enough that
+ * forcing call_send to true for piothreshold packets is necessary.
+ */
+ if (wqe->length <= piothreshold)
+ *call_send = true;
+ return 0;
}
/**
@@ -333,7 +375,7 @@
* It is only used in the post send, which doesn't hold
* the s_lock.
*/
-void _hfi1_schedule_send(struct rvt_qp *qp)
+bool _hfi1_schedule_send(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibport *ibp =
@@ -341,28 +383,26 @@
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
- iowait_schedule(&priv->s_iowait, ppd->hfi1_wq,
- priv->s_sde ?
- priv->s_sde->cpu :
- cpumask_first(cpumask_of_node(dd->node)));
+ return iowait_schedule(&priv->s_iowait, ppd->hfi1_wq,
+ priv->s_sde ?
+ priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(dd->node)));
}
static void qp_pio_drain(struct rvt_qp *qp)
{
- struct hfi1_ibdev *dev;
struct hfi1_qp_priv *priv = qp->priv;
if (!priv->s_sendcontext)
return;
- dev = to_idev(qp->ibqp.device);
while (iowait_pio_pending(&priv->s_iowait)) {
- write_seqlock_irq(&dev->iowait_lock);
+ write_seqlock_irq(&priv->s_sendcontext->waitlock);
hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1);
- write_sequnlock_irq(&dev->iowait_lock);
+ write_sequnlock_irq(&priv->s_sendcontext->waitlock);
iowait_pio_drain(&priv->s_iowait);
- write_seqlock_irq(&dev->iowait_lock);
+ write_seqlock_irq(&priv->s_sendcontext->waitlock);
hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0);
- write_sequnlock_irq(&dev->iowait_lock);
+ write_sequnlock_irq(&priv->s_sendcontext->waitlock);
}
}
@@ -372,12 +412,37 @@
*
* This schedules qp progress and caller should hold
* the s_lock.
+ * @return true if the first leg is scheduled;
+ * false if the first leg is not scheduled.
*/
-void hfi1_schedule_send(struct rvt_qp *qp)
+bool hfi1_schedule_send(struct rvt_qp *qp)
{
lockdep_assert_held(&qp->s_lock);
- if (hfi1_send_ok(qp))
+ if (hfi1_send_ok(qp)) {
_hfi1_schedule_send(qp);
+ return true;
+ }
+ if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+ iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
+ IOWAIT_PENDING_IB);
+ return false;
+}
+
+static void hfi1_qp_schedule(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ bool ret;
+
+ if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_IB)) {
+ ret = hfi1_schedule_send(qp);
+ if (ret)
+ iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
+ }
+ if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_TID)) {
+ ret = hfi1_schedule_tid_send(qp);
+ if (ret)
+ iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ }
}
void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag)
@@ -388,16 +453,41 @@
if (qp->s_flags & flag) {
qp->s_flags &= ~flag;
trace_hfi1_qpwakeup(qp, flag);
- hfi1_schedule_send(qp);
+ hfi1_qp_schedule(qp);
}
spin_unlock_irqrestore(&qp->s_lock, flags);
/* Notify hfi1_destroy_qp() if it is waiting. */
rvt_put_qp(qp);
}
+void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we are sending a first-leg packet from the second leg,
+ * we need to clear the busy flag from priv->s_flags to
+ * avoid a race condition when the qp wakes up before
+ * the call to hfi1_verbs_send() returns to the second
+ * leg. In that case, the second leg will terminate without
+ * being re-scheduled, resulting in failure to send TID RDMA
+ * WRITE DATA and TID RDMA ACK packets.
+ */
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ priv->s_flags &= ~(HFI1_S_TID_BUSY_SET |
+ RVT_S_BUSY);
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ }
+ } else {
+ priv->s_flags &= ~RVT_S_BUSY;
+ }
+}
+
static int iowait_sleep(
struct sdma_engine *sde,
- struct iowait *wait,
+ struct iowait_work *wait,
struct sdma_txreq *stx,
uint seq,
bool pkts_sent)
@@ -407,7 +497,6 @@
struct hfi1_qp_priv *priv;
unsigned long flags;
int ret = 0;
- struct hfi1_ibdev *dev;
qp = tx->qp;
priv = qp->priv;
@@ -420,9 +509,8 @@
* buffer and undoing the side effects of the copy.
*/
/* Make a common routine? */
- dev = &sde->dd->verbs_dev;
list_add_tail(&stx->list, &wait->tx_head);
- write_seqlock(&dev->iowait_lock);
+ write_seqlock(&sde->waitlock);
if (sdma_progress(sde, seq, stx))
goto eagain;
if (list_empty(&priv->s_iowait.list)) {
@@ -431,14 +519,15 @@
ibp->rvp.n_dmawait++;
qp->s_flags |= RVT_S_WAIT_DMA_DESC;
+ iowait_get_priority(&priv->s_iowait);
iowait_queue(pkts_sent, &priv->s_iowait,
&sde->dmawait);
- priv->s_iowait.lock = &dev->iowait_lock;
+ priv->s_iowait.lock = &sde->waitlock;
trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC);
rvt_get_qp(qp);
}
- write_sequnlock(&dev->iowait_lock);
- qp->s_flags &= ~RVT_S_BUSY;
+ write_sequnlock(&sde->waitlock);
+ hfi1_qp_unbusy(qp, wait);
spin_unlock_irqrestore(&qp->s_lock, flags);
ret = -EBUSY;
} else {
@@ -447,7 +536,7 @@
}
return ret;
eagain:
- write_sequnlock(&dev->iowait_lock);
+ write_sequnlock(&sde->waitlock);
spin_unlock_irqrestore(&qp->s_lock, flags);
list_del_init(&stx->list);
return -EAGAIN;
@@ -480,6 +569,17 @@
spin_unlock_irqrestore(&qp->s_lock, flags);
}
+static void hfi1_init_priority(struct iowait *w)
+{
+ struct rvt_qp *qp = iowait_to_qp(w);
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (qp->s_flags & RVT_S_ACK_PENDING)
+ w->priority++;
+ if (priv->s_flags & RVT_S_ACK_PENDING)
+ w->priority++;
+}
+
/**
* qp_to_sdma_engine - map a qp to a send engine
* @qp: the QP
@@ -602,8 +702,8 @@
sde ? sde->this_idx : 0,
send_context,
send_context ? send_context->sw_index : 0,
- ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head,
- ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail,
+ ib_cq_head(qp->ibqp.send_cq),
+ ib_cq_tail(qp->ibqp.send_cq),
qp->pid,
qp->s_state,
qp->s_ack_state,
@@ -637,9 +737,13 @@
&priv->s_iowait,
1,
_hfi1_do_send,
+ _hfi1_do_tid_send,
iowait_sleep,
iowait_wakeup,
- iowait_sdma_drained);
+ iowait_sdma_drained,
+ hfi1_init_priority);
+ /* Init to a value to start the running average correctly */
+ priv->s_running_pkt_size = piothreshold / 2;
return priv;
}
@@ -647,6 +751,7 @@
{
struct hfi1_qp_priv *priv = qp->priv;
+ hfi1_qp_priv_tid_free(rdi, qp);
kfree(priv->s_ahg);
kfree(priv);
}
@@ -680,19 +785,24 @@
{
lockdep_assert_held(&qp->s_lock);
flush_iowait(qp);
+ hfi1_tid_rdma_flush_wait(qp);
}
void stop_send_queue(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
- cancel_work_sync(&priv->s_iowait.iowork);
+ iowait_cancel_work(&priv->s_iowait);
+ if (cancel_work_sync(&priv->tid_rdma.trigger_work))
+ rvt_put_qp(qp);
}
void quiesce_qp(struct rvt_qp *qp)
{
struct hfi1_qp_priv *priv = qp->priv;
+ hfi1_del_tid_reap_timer(qp);
+ hfi1_del_tid_retry_timer(qp);
iowait_sdma_drain(&priv->s_iowait);
qp_pio_drain(qp);
flush_tx_list(qp);
@@ -700,8 +810,13 @@
void notify_qp_reset(struct rvt_qp *qp)
{
+ hfi1_qp_kern_exp_rcv_clear_all(qp);
qp->r_adefered = 0;
clear_ahg(qp);
+
+ /* Clear any OPFN state */
+ if (qp->ibqp.qp_type == IB_QPT_RC)
+ opfn_conn_error(qp);
}
/*
@@ -783,8 +898,11 @@
if (lock) {
write_seqlock(lock);
if (!list_empty(&priv->s_iowait.list) &&
- !(qp->s_flags & RVT_S_BUSY)) {
- qp->s_flags &= ~RVT_S_ANY_WAIT_IO;
+ !(qp->s_flags & RVT_S_BUSY) &&
+ !(priv->s_flags & RVT_S_BUSY)) {
+ qp->s_flags &= ~HFI1_S_ANY_WAIT_IO;
+ iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
+ iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
list_del_init(&priv->s_iowait.list);
priv->s_iowait.lock = NULL;
rvt_put_qp(qp);
@@ -792,7 +910,8 @@
write_sequnlock(lock);
}
- if (!(qp->s_flags & RVT_S_BUSY)) {
+ if (!(qp->s_flags & RVT_S_BUSY) && !(priv->s_flags & RVT_S_BUSY)) {
+ qp->s_hdrwords = 0;
if (qp->s_rdma_mr) {
rvt_put_mr(qp->s_rdma_mr);
qp->s_rdma_mr = NULL;
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index 078cff7..b670321 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -58,28 +58,22 @@
extern const struct rvt_operation_params hfi1_post_parms[];
/*
- * Send if not busy or waiting for I/O and either
- * a RC response is pending or we can process send work requests.
- */
-static inline int hfi1_send_ok(struct rvt_qp *qp)
-{
- return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) &&
- (verbs_txreq_queued(qp) ||
- (qp->s_flags & RVT_S_RESP_PENDING) ||
- !(qp->s_flags & RVT_S_ANY_WAIT_SEND));
-}
-
-/*
* Driver specific s_flags starting at bit 31 down to HFI1_S_MIN_BIT_MASK
*
* HFI1_S_AHG_VALID - ahg header valid on chip
* HFI1_S_AHG_CLEAR - have send engine clear ahg state
* HFI1_S_WAIT_PIO_DRAIN - qp waiting for PIOs to drain
+ * HFI1_S_WAIT_TID_SPACE - a QP is waiting for TID resource
+ * HFI1_S_WAIT_TID_RESP - waiting for a TID RDMA WRITE response
+ * HFI1_S_WAIT_HALT - halt the first leg send engine
* HFI1_S_MIN_BIT_MASK - the lowest bit that can be used by hfi1
*/
#define HFI1_S_AHG_VALID 0x80000000
#define HFI1_S_AHG_CLEAR 0x40000000
#define HFI1_S_WAIT_PIO_DRAIN 0x20000000
+#define HFI1_S_WAIT_TID_SPACE 0x10000000
+#define HFI1_S_WAIT_TID_RESP 0x08000000
+#define HFI1_S_WAIT_HALT 0x04000000
#define HFI1_S_MIN_BIT_MASK 0x01000000
/*
@@ -88,6 +82,21 @@
#define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN)
#define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND)
+#define HFI1_S_ANY_TID_WAIT_SEND (RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_DMA)
+
+/*
+ * Send if not busy or waiting for I/O and either
+ * a RC response is pending or we can process send work requests.
+ */
+static inline int hfi1_send_ok(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ return !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)) &&
+ (verbs_txreq_queued(iowait_get_ib_work(&priv->s_iowait)) ||
+ (qp->s_flags & RVT_S_RESP_PENDING) ||
+ !(qp->s_flags & RVT_S_ANY_WAIT_SEND));
+}
/*
* free_ahg - clear ahg from QP
@@ -129,8 +138,8 @@
void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter);
-void _hfi1_schedule_send(struct rvt_qp *qp);
-void hfi1_schedule_send(struct rvt_qp *qp);
+bool _hfi1_schedule_send(struct rvt_qp *qp);
+bool hfi1_schedule_send(struct rvt_qp *qp);
void hfi1_migrate_qp(struct rvt_qp *qp);
@@ -150,4 +159,5 @@
u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu);
int mtu_to_path_mtu(u32 mtu);
void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl);
+void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait);
#endif /* _QP_H */
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 9bd63ab..1a3c647 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -51,24 +51,48 @@
#include "hfi.h"
#include "qp.h"
+#include "rc.h"
#include "verbs_txreq.h"
#include "trace.h"
-/* cut down ridiculously long IB macro names */
-#define OP(x) RC_OP(x)
-
-static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
- u32 psn, u32 pmtu)
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+ u8 *prev_ack, bool *scheduled)
+ __must_hold(&qp->s_lock)
{
- u32 len;
+ struct rvt_ack_entry *e = NULL;
+ u8 i, p;
+ bool s = true;
- len = delta_psn(psn, wqe->psn) * pmtu;
- ss->sge = wqe->sg_list[0];
- ss->sg_list = wqe->sg_list + 1;
- ss->num_sge = wqe->wr.num_sge;
- ss->total_len = wqe->length;
- rvt_skip_sge(ss, len, false);
- return wqe->length - len;
+ for (i = qp->r_head_ack_queue; ; i = p) {
+ if (i == qp->s_tail_ack_queue)
+ s = false;
+ if (i)
+ p = i - 1;
+ else
+ p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
+ if (p == qp->r_head_ack_queue) {
+ e = NULL;
+ break;
+ }
+ e = &qp->s_ack_queue[p];
+ if (!e->opcode) {
+ e = NULL;
+ break;
+ }
+ if (cmp_psn(psn, e->psn) >= 0) {
+ if (p == qp->s_tail_ack_queue &&
+ cmp_psn(psn, e->lpsn) <= 0)
+ s = false;
+ break;
+ }
+ }
+ if (prev)
+ *prev = p;
+ if (prev_ack)
+ *prev_ack = i;
+ if (scheduled)
+ *scheduled = s;
+ return e;
}
/**
@@ -87,20 +111,25 @@
struct hfi1_pkt_state *ps)
{
struct rvt_ack_entry *e;
- u32 hwords;
- u32 len;
- u32 bth0;
- u32 bth2;
+ u32 hwords, hdrlen;
+ u32 len = 0;
+ u32 bth0 = 0, bth2 = 0;
+ u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
int middle = 0;
u32 pmtu = qp->pmtu;
- struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ bool last_pkt;
+ u32 delta;
+ u8 next = qp->s_tail_ack_queue;
+ struct tid_rdma_request *req;
+ trace_hfi1_rsp_make_rc_ack(qp, 0);
lockdep_assert_held(&qp->s_lock);
/* Don't send an ACK if we aren't supposed to. */
if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
goto bail;
- if (priv->hdr_type == HFI1_PKT_TYPE_9B)
+ if (qpriv->hdr_type == HFI1_PKT_TYPE_9B)
/* header size in 32-bit words LRH+BTH = (8+12)/4. */
hwords = 5;
else
@@ -111,10 +140,7 @@
case OP(RDMA_READ_RESPONSE_LAST):
case OP(RDMA_READ_RESPONSE_ONLY):
e = &qp->s_ack_queue[qp->s_tail_ack_queue];
- if (e->rdma_sge.mr) {
- rvt_put_mr(e->rdma_sge.mr);
- e->rdma_sge.mr = NULL;
- }
+ release_rdma_sge_mr(e);
/* FALLTHROUGH */
case OP(ATOMIC_ACKNOWLEDGE):
/*
@@ -122,8 +148,18 @@
* response has been sent instead of only being
* constructed.
*/
- if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
- qp->s_tail_ack_queue = 0;
+ if (++next > rvt_size_atomic(&dev->rdi))
+ next = 0;
+ /*
+ * Only advance the s_acked_ack_queue pointer if there
+ * have been no TID RDMA requests.
+ */
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ if (e->opcode != TID_OP(WRITE_REQ) &&
+ qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = next;
+ qp->s_tail_ack_queue = next;
+ trace_hfi1_rsp_make_rc_ack(qp, e->psn);
/* FALLTHROUGH */
case OP(SEND_ONLY):
case OP(ACKNOWLEDGE):
@@ -135,6 +171,12 @@
}
e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ /* Check for tid write fence */
+ if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) ||
+ hfi1_tid_rdma_ack_interlock(qp, e)) {
+ iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB);
+ goto bail;
+ }
if (e->opcode == OP(RDMA_READ_REQUEST)) {
/*
* If a RDMA read response is being resent and
@@ -144,6 +186,10 @@
*/
len = e->rdma_sge.sge_length;
if (len && !e->rdma_sge.mr) {
+ if (qp->s_acked_ack_queue ==
+ qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue =
+ qp->r_head_ack_queue;
qp->s_tail_ack_queue = qp->r_head_ack_queue;
goto bail;
}
@@ -165,6 +211,45 @@
hwords++;
qp->s_ack_rdma_psn = e->psn;
bth2 = mask_psn(qp->s_ack_rdma_psn++);
+ } else if (e->opcode == TID_OP(WRITE_REQ)) {
+ /*
+ * If a TID RDMA WRITE RESP is being resent, we have to
+ * wait for the actual request. All requests that are to
+ * be resent will have their state set to
+ * TID_REQUEST_RESEND. When the new request arrives, the
+ * state will be changed to TID_REQUEST_RESEND_ACTIVE.
+ */
+ req = ack_to_tid_req(e);
+ if (req->state == TID_REQUEST_RESEND ||
+ req->state == TID_REQUEST_INIT_RESEND)
+ goto bail;
+ qp->s_ack_state = TID_OP(WRITE_RESP);
+ qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
+ goto write_resp;
+ } else if (e->opcode == TID_OP(READ_REQ)) {
+ /*
+ * If a TID RDMA read response is being resent and
+ * we haven't seen the duplicate request yet,
+ * then stop sending the remaining responses the
+ * responder has seen until the requester re-sends it.
+ */
+ len = e->rdma_sge.sge_length;
+ if (len && !e->rdma_sge.mr) {
+ if (qp->s_acked_ack_queue ==
+ qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue =
+ qp->r_head_ack_queue;
+ qp->s_tail_ack_queue = qp->r_head_ack_queue;
+ goto bail;
+ }
+ /* Copy SGE state in case we need to resend */
+ ps->s_txreq->mr = e->rdma_sge.mr;
+ if (ps->s_txreq->mr)
+ rvt_get_mr(ps->s_txreq->mr);
+ qp->s_ack_rdma_sge.sge = e->rdma_sge;
+ qp->s_ack_rdma_sge.num_sge = 1;
+ qp->s_ack_state = TID_OP(READ_RESP);
+ goto read_resp;
} else {
/* COMPARE_SWAP or FETCH_ADD */
ps->s_txreq->ss = NULL;
@@ -176,6 +261,7 @@
bth2 = mask_psn(e->psn);
e->sent = 1;
}
+ trace_hfi1_tid_write_rsp_make_rc_ack(qp);
bth0 = qp->s_ack_state << 24;
break;
@@ -202,6 +288,84 @@
bth2 = mask_psn(qp->s_ack_rdma_psn++);
break;
+ case TID_OP(WRITE_RESP):
+write_resp:
+ /*
+ * 1. Check if RVT_S_ACK_PENDING is set. If yes,
+ * goto normal.
+ * 2. Attempt to allocate TID resources.
+ * 3. Remove RVT_S_RESP_PENDING flags from s_flags
+ * 4. If resources not available:
+ * 4.1 Set RVT_S_WAIT_TID_SPACE
+ * 4.2 Queue QP on RCD TID queue
+ * 4.3 Put QP on iowait list.
+ * 4.4 Build IB RNR NAK with appropriate timeout value
+ * 4.5 Return indication progress made.
+ * 5. If resources are available:
+ * 5.1 Program HW flow CSRs
+ * 5.2 Build TID RDMA WRITE RESP packet
+ * 5.3 If more resources needed, do 2.1 - 2.3.
+ * 5.4 Wake up next QP on RCD TID queue.
+ * 5.5 Return indication progress made.
+ */
+
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ req = ack_to_tid_req(e);
+
+ /*
+ * Send scheduled RNR NAK's. RNR NAK's need to be sent at
+ * segment boundaries, not at request boundaries. Don't change
+ * s_ack_state because we are still in the middle of a request
+ */
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND &&
+ qp->s_tail_ack_queue == qpriv->r_tid_alloc &&
+ req->cur_seg == req->alloc_seg) {
+ qpriv->rnr_nak_state = TID_RNR_NAK_SENT;
+ goto normal_no_state;
+ }
+
+ bth2 = mask_psn(qp->s_ack_rdma_psn);
+ hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1,
+ bth2, &len,
+ &ps->s_txreq->ss);
+ if (!hdrlen)
+ return 0;
+
+ hwords += hdrlen;
+ bth0 = qp->s_ack_state << 24;
+ qp->s_ack_rdma_psn++;
+ trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ if (req->cur_seg != req->total_segs)
+ break;
+
+ e->sent = 1;
+ /* Do not free e->rdma_sge until all data are received */
+ qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+ break;
+
+ case TID_OP(READ_RESP):
+read_resp:
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ ps->s_txreq->ss = &qp->s_ack_rdma_sge;
+ delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0,
+ &bth1, &bth2, &len,
+ &last_pkt);
+ if (delta == 0)
+ goto error_qp;
+ hwords += delta;
+ if (last_pkt) {
+ e->sent = 1;
+ /*
+ * Increment qp->s_tail_ack_queue through s_ack_state
+ * transition.
+ */
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+ }
+ break;
+ case TID_OP(READ_REQ):
+ goto bail;
+
default:
normal:
/*
@@ -211,8 +375,7 @@
* (see above).
*/
qp->s_ack_state = OP(SEND_ONLY);
- qp->s_flags &= ~RVT_S_ACK_PENDING;
- ps->s_txreq->ss = NULL;
+normal_no_state:
if (qp->s_nak_state)
ohdr->u.aeth =
cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
@@ -224,14 +387,24 @@
len = 0;
bth0 = OP(ACKNOWLEDGE) << 24;
bth2 = mask_psn(qp->s_ack_psn);
+ qp->s_flags &= ~RVT_S_ACK_PENDING;
+ ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
+ ps->s_txreq->ss = NULL;
}
qp->s_rdma_ack_cnt++;
- ps->s_txreq->sde = priv->s_sde;
+ ps->s_txreq->sde = qpriv->s_sde;
ps->s_txreq->s_cur_size = len;
ps->s_txreq->hdr_dwords = hwords;
- hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
+ hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
return 1;
-
+error_qp:
+ spin_unlock_irqrestore(&qp->s_lock, ps->flags);
+ spin_lock_irqsave(&qp->r_lock, ps->flags);
+ spin_lock(&qp->s_lock);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_lock, ps->flags);
+ spin_lock_irqsave(&qp->s_lock, ps->flags);
bail:
qp->s_ack_state = OP(ACKNOWLEDGE);
/*
@@ -258,17 +431,23 @@
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
struct ib_other_headers *ohdr;
- struct rvt_sge_state *ss;
+ struct rvt_sge_state *ss = NULL;
struct rvt_swqe *wqe;
- u32 hwords;
- u32 len;
- u32 bth0 = 0;
- u32 bth2;
+ struct hfi1_swqe_priv *wpriv;
+ struct tid_rdma_request *req = NULL;
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ u32 hwords = 5;
+ u32 len = 0;
+ u32 bth0 = 0, bth2 = 0;
+ u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
u32 pmtu = qp->pmtu;
char newreq;
int middle = 0;
int delta;
+ struct tid_rdma_flow *flow = NULL;
+ struct tid_rdma_params *remote;
+ trace_hfi1_sender_make_rc_req(qp);
lockdep_assert_held(&qp->s_lock);
ps->s_txreq = get_txreq(ps->dev, qp);
if (!ps->s_txreq)
@@ -309,13 +488,13 @@
}
clear_ahg(qp);
wqe = rvt_get_swqe_ptr(qp, qp->s_last);
- hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
- IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+ hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
/* will get called again */
goto done_free_tx;
}
- if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
+ if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT))
goto bail;
if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
@@ -329,6 +508,7 @@
/* Send a request. */
wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
+check_s_state:
switch (qp->s_state) {
default:
if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
@@ -350,9 +530,13 @@
/*
* If a fence is requested, wait for previous
* RDMA read and atomic operations to finish.
+ * However, there is no need to guard against
+ * TID RDMA READ after TID RDMA READ.
*/
if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
- qp->s_num_rd_atomic) {
+ qp->s_num_rd_atomic &&
+ (wqe->wr.opcode != IB_WR_TID_RDMA_READ ||
+ priv->pending_tid_r_segs < qp->s_num_rd_atomic)) {
qp->s_flags |= RVT_S_WAIT_FENCE;
goto bail;
}
@@ -378,9 +562,9 @@
wqe->wr.ex.invalidate_rkey);
local_ops = 1;
}
- hfi1_send_complete(qp, wqe,
- err ? IB_WC_LOC_PROT_ERR
- : IB_WC_SUCCESS);
+ rvt_send_complete(qp, wqe,
+ err ? IB_WC_LOC_PROT_ERR
+ : IB_WC_SUCCESS);
if (local_ops)
atomic_dec(&qp->local_ops_pending);
goto done_free_tx;
@@ -397,16 +581,22 @@
len = wqe->length;
ss = &qp->s_sge;
bth2 = mask_psn(qp->s_psn);
+
+ /*
+ * Interlock between various IB requests and TID RDMA
+ * if necessary.
+ */
+ if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) ||
+ hfi1_tid_rdma_wqe_interlock(qp, wqe))
+ goto bail;
+
switch (wqe->wr.opcode) {
case IB_WR_SEND:
case IB_WR_SEND_WITH_IMM:
case IB_WR_SEND_WITH_INV:
/* If no credit, return. */
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
- rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
- qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
+ if (!rvt_rc_credit_avail(qp, wqe))
goto bail;
- }
if (len > pmtu) {
qp->s_state = OP(SEND_FIRST);
len = pmtu;
@@ -439,11 +629,8 @@
goto no_flow_control;
case IB_WR_RDMA_WRITE_WITH_IMM:
/* If no credit, return. */
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
- rvt_cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
- qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
+ if (!rvt_rc_credit_avail(qp, wqe))
goto bail;
- }
no_flow_control:
put_ib_reth_vaddr(
wqe->rdma_wr.remote_addr,
@@ -473,21 +660,126 @@
qp->s_cur = 0;
break;
+ case IB_WR_TID_RDMA_WRITE:
+ if (newreq) {
+ /*
+ * Limit the number of TID RDMA WRITE requests.
+ */
+ if (atomic_read(&priv->n_tid_requests) >=
+ HFI1_TID_RDMA_WRITE_CNT)
+ goto bail;
+
+ if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ }
+
+ hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
+ &bth1, &bth2,
+ &len);
+ ss = NULL;
+ if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) {
+ priv->s_tid_cur = qp->s_cur;
+ if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) {
+ priv->s_tid_tail = qp->s_cur;
+ priv->s_state = TID_OP(WRITE_RESP);
+ }
+ } else if (priv->s_tid_cur == priv->s_tid_head) {
+ struct rvt_swqe *__w;
+ struct tid_rdma_request *__r;
+
+ __w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
+ __r = wqe_to_tid_req(__w);
+
+ /*
+ * The s_tid_cur pointer is advanced to s_cur if
+ * any of the following conditions about the WQE
+ * to which s_ti_cur currently points to are
+ * satisfied:
+ * 1. The request is not a TID RDMA WRITE
+ * request,
+ * 2. The request is in the INACTIVE or
+ * COMPLETE states (TID RDMA READ requests
+ * stay at INACTIVE and TID RDMA WRITE
+ * transition to COMPLETE when done),
+ * 3. The request is in the ACTIVE or SYNC
+ * state and the number of completed
+ * segments is equal to the total segment
+ * count.
+ * (If ACTIVE, the request is waiting for
+ * ACKs. If SYNC, the request has not
+ * received any responses because it's
+ * waiting on a sync point.)
+ */
+ if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE ||
+ __r->state == TID_REQUEST_INACTIVE ||
+ __r->state == TID_REQUEST_COMPLETE ||
+ ((__r->state == TID_REQUEST_ACTIVE ||
+ __r->state == TID_REQUEST_SYNC) &&
+ __r->comp_seg == __r->total_segs)) {
+ if (priv->s_tid_tail ==
+ priv->s_tid_cur &&
+ priv->s_state ==
+ TID_OP(WRITE_DATA_LAST)) {
+ priv->s_tid_tail = qp->s_cur;
+ priv->s_state =
+ TID_OP(WRITE_RESP);
+ }
+ priv->s_tid_cur = qp->s_cur;
+ }
+ /*
+ * A corner case: when the last TID RDMA WRITE
+ * request was completed, s_tid_head,
+ * s_tid_cur, and s_tid_tail all point to the
+ * same location. Other requests are posted and
+ * s_cur wraps around to the same location,
+ * where a new TID RDMA WRITE is posted. In
+ * this case, none of the indices need to be
+ * updated. However, the priv->s_state should.
+ */
+ if (priv->s_tid_tail == qp->s_cur &&
+ priv->s_state == TID_OP(WRITE_DATA_LAST))
+ priv->s_state = TID_OP(WRITE_RESP);
+ }
+ req = wqe_to_tid_req(wqe);
+ if (newreq) {
+ priv->s_tid_head = qp->s_cur;
+ priv->pending_tid_w_resp += req->total_segs;
+ atomic_inc(&priv->n_tid_requests);
+ atomic_dec(&priv->n_requests);
+ } else {
+ req->state = TID_REQUEST_RESEND;
+ req->comp_seg = delta_psn(bth2, wqe->psn);
+ /*
+ * Pull back any segments since we are going
+ * to re-receive them.
+ */
+ req->setup_head = req->clear_tail;
+ priv->pending_tid_w_resp +=
+ delta_psn(wqe->lpsn, bth2) + 1;
+ }
+
+ trace_hfi1_tid_write_sender_make_req(qp, newreq);
+ trace_hfi1_tid_req_make_req_write(qp, newreq,
+ wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ req);
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
case IB_WR_RDMA_READ:
/*
* Don't allow more operations to be started
* than the QP limits allow.
*/
- if (newreq) {
- if (qp->s_num_rd_atomic >=
- qp->s_max_rd_atomic) {
- qp->s_flags |= RVT_S_WAIT_RDMAR;
- goto bail;
- }
- qp->s_num_rd_atomic++;
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
- qp->s_lsn++;
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
}
+ qp->s_num_rd_atomic++;
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
put_ib_reth_vaddr(
wqe->rdma_wr.remote_addr,
&ohdr->u.rc.reth);
@@ -503,23 +795,99 @@
qp->s_cur = 0;
break;
+ case IB_WR_TID_RDMA_READ:
+ trace_hfi1_tid_read_sender_make_req(qp, newreq);
+ wpriv = wqe->priv;
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_make_req_read(qp, newreq,
+ wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ req);
+ delta = cmp_psn(qp->s_psn, wqe->psn);
+
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow. We could get here under
+ * three conditions; (1) It's a new request; (2) We are
+ * sending the second or later segment of a request,
+ * but the qp->s_state is set to OP(RDMA_READ_REQUEST)
+ * when the last segment of a previous request is
+ * received just before this; (3) We are re-sending a
+ * request.
+ */
+ if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
+ }
+ if (newreq) {
+ struct tid_rdma_flow *flow =
+ &req->flows[req->setup_head];
+
+ /*
+ * Set up s_sge as it is needed for TID
+ * allocation. However, if the pages have been
+ * walked and mapped, skip it. An earlier try
+ * has failed to allocate the TID entries.
+ */
+ if (!flow->npagesets) {
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ qp->s_len = wqe->length;
+ req->isge = 0;
+ req->clear_tail = req->setup_head;
+ req->flow_idx = req->setup_head;
+ req->state = TID_REQUEST_ACTIVE;
+ }
+ } else if (delta == 0) {
+ /* Re-send a request */
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_pending = 0;
+ req->flow_idx = req->clear_tail;
+ req->state = TID_REQUEST_RESEND;
+ }
+ req->s_next_psn = qp->s_psn;
+ /* Read one segment at a time */
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr,
+ &bth1, &bth2,
+ &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
case IB_WR_ATOMIC_CMP_AND_SWP:
case IB_WR_ATOMIC_FETCH_AND_ADD:
/*
* Don't allow more operations to be started
* than the QP limits allow.
*/
- if (newreq) {
- if (qp->s_num_rd_atomic >=
- qp->s_max_rd_atomic) {
- qp->s_flags |= RVT_S_WAIT_RDMAR;
- goto bail;
- }
- qp->s_num_rd_atomic++;
- if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
- qp->s_lsn++;
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
}
- if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ qp->s_num_rd_atomic++;
+
+ /* FALLTHROUGH */
+ case IB_WR_OPFN:
+ if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_OPFN) {
qp->s_state = OP(COMPARE_SWAP);
put_ib_ateth_swap(wqe->atomic_wr.swap,
&ohdr->u.atomic_eth);
@@ -546,18 +914,23 @@
default:
goto bail;
}
- qp->s_sge.sge = wqe->sg_list[0];
- qp->s_sge.sg_list = wqe->sg_list + 1;
- qp->s_sge.num_sge = wqe->wr.num_sge;
- qp->s_sge.total_len = wqe->length;
- qp->s_len = wqe->length;
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) {
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ qp->s_len = wqe->length;
+ }
if (newreq) {
qp->s_tail++;
if (qp->s_tail >= qp->s_size)
qp->s_tail = 0;
}
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
qp->s_psn = wqe->lpsn + 1;
+ else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ qp->s_psn = req->s_next_psn;
else
qp->s_psn++;
break;
@@ -674,10 +1047,137 @@
if (qp->s_cur == qp->s_size)
qp->s_cur = 0;
break;
+
+ case TID_OP(WRITE_RESP):
+ /*
+ * This value for s_state is used for restarting a TID RDMA
+ * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE
+ * for more).
+ */
+ req = wqe_to_tid_req(wqe);
+ req->state = TID_REQUEST_RESEND;
+ rcu_read_lock();
+ remote = rcu_dereference(priv->tid_rdma.remote);
+ req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
+ len = wqe->length - (req->comp_seg * remote->max_len);
+ rcu_read_unlock();
+
+ bth2 = mask_psn(qp->s_psn);
+ hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
+ &bth2, &len);
+ qp->s_psn = wqe->lpsn + 1;
+ ss = NULL;
+ qp->s_state = TID_OP(WRITE_REQ);
+ priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
+ priv->s_tid_cur = qp->s_cur;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
+
+ case TID_OP(READ_RESP):
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+ goto bail;
+ /* This is used to restart a TID read request */
+ req = wqe_to_tid_req(wqe);
+ wpriv = wqe->priv;
+ /*
+ * Back down. The field qp->s_psn has been set to the psn with
+ * which the request should be restart. It's OK to use division
+ * as this is on the retry path.
+ */
+ req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps;
+
+ /*
+ * The following function need to be redefined to return the
+ * status to make sure that we find the flow. At the same
+ * time, we can use the req->state change to check if the
+ * call succeeds or not.
+ */
+ req->state = TID_REQUEST_RESEND;
+ hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
+ if (req->state != TID_REQUEST_ACTIVE) {
+ /*
+ * Failed to find the flow. Release all allocated tid
+ * resources.
+ */
+ hfi1_kern_exp_rcv_clear_all(req);
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+
+ hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR);
+ goto bail;
+ }
+ req->state = TID_REQUEST_RESEND;
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ flow = &req->flows[req->flow_idx];
+ len -= flow->sent;
+ req->s_next_psn = flow->flow_state.ib_lpsn + 1;
+ delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1,
+ &bth2, &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ qp->s_psn = req->s_next_psn;
+ trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
+ case TID_OP(READ_REQ):
+ req = wqe_to_tid_req(wqe);
+ delta = cmp_psn(qp->s_psn, wqe->psn);
+ /*
+ * If the current WR is not TID RDMA READ, or this is the start
+ * of a new request, we need to change the qp->s_state so that
+ * the request can be set up properly.
+ */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 ||
+ qp->s_cur == qp->s_tail) {
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ if (delta == 0 || qp->s_cur == qp->s_tail)
+ goto check_s_state;
+ else
+ goto bail;
+ }
+
+ /* Rate limiting */
+ if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
+ qp->s_flags |= RVT_S_WAIT_RDMAR;
+ goto bail;
+ }
+
+ wpriv = wqe->priv;
+ /* Read one segment at a time */
+ len = min_t(u32, req->seg_len,
+ wqe->length - req->seg_len * req->cur_seg);
+ delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1,
+ &bth2, &len);
+ if (delta <= 0) {
+ /* Wait for TID space */
+ goto bail;
+ }
+ hwords += delta;
+ ss = &wpriv->ss;
+ /* Check if this is the last segment */
+ if (req->cur_seg >= req->total_segs &&
+ ++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ qp->s_psn = req->s_next_psn;
+ trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ break;
}
qp->s_sending_hpsn = bth2;
delta = delta_psn(bth2, wqe->psn);
- if (delta && delta % HFI1_PSN_CREDIT == 0)
+ if (delta && delta % HFI1_PSN_CREDIT == 0 &&
+ wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
bth2 |= IB_BTH_REQ_ACK;
if (qp->s_flags & RVT_S_SEND_ONE) {
qp->s_flags &= ~RVT_S_SEND_ONE;
@@ -693,6 +1193,7 @@
qp,
ohdr,
bth0 | (qp->s_state << 24),
+ bth1,
bth2,
middle,
ps);
@@ -709,6 +1210,12 @@
bail_no_tx:
ps->s_txreq = NULL;
qp->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we didn't get a txreq, the QP will be woken up later to try
+ * again. Set the flags to indicate which work item to wake
+ * up.
+ */
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
return 0;
}
@@ -796,6 +1303,11 @@
if (qp->s_mig_state == IB_MIG_MIGRATED)
bth0 |= IB_BTH_MIG_REQ;
bth1 = (!!is_fecn) << IB_BECN_SHIFT;
+ /*
+ * Inline ACKs go out without the use of the Verbs send engine, so
+ * we need to set the STL Verbs Extended bit here
+ */
+ bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT;
hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
}
@@ -914,7 +1426,7 @@
pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps,
sc_to_vlt(ppd->dd, sc5), plen);
pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL);
- if (!pbuf) {
+ if (IS_ERR_OR_NULL(pbuf)) {
/*
* We have no room to send at the moment. Pass
* responsibility for sending the ACK to the send engine
@@ -936,6 +1448,48 @@
}
/**
+ * update_num_rd_atomic - update the qp->s_num_rd_atomic
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ * @wqe: the wqe
+ *
+ * This is called from reset_psn() to update qp->s_num_rd_atomic
+ * for the current wqe.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn,
+ struct rvt_swqe *wqe)
+{
+ u32 opcode = wqe->wr.opcode;
+
+ if (opcode == IB_WR_RDMA_READ ||
+ opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ qp->s_num_rd_atomic++;
+ } else if (opcode == IB_WR_TID_RDMA_READ) {
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (cmp_psn(psn, wqe->lpsn) <= 0) {
+ u32 cur_seg;
+
+ cur_seg = (psn - wqe->psn) / priv->pkts_ps;
+ req->ack_pending = cur_seg - req->comp_seg;
+ priv->pending_tid_r_segs += req->ack_pending;
+ qp->s_num_rd_atomic += req->ack_pending;
+ trace_hfi1_tid_req_update_num_rd_atomic(qp, 0,
+ wqe->wr.opcode,
+ wqe->psn,
+ wqe->lpsn,
+ req);
+ } else {
+ priv->pending_tid_r_segs += req->total_segs;
+ qp->s_num_rd_atomic += req->total_segs;
+ }
+ }
+}
+
+/**
* reset_psn - reset the QP state to send starting from PSN
* @qp: the QP
* @psn: the packet sequence number to restart at
@@ -949,9 +1503,13 @@
u32 n = qp->s_acked;
struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
u32 opcode;
+ struct hfi1_qp_priv *priv = qp->priv;
lockdep_assert_held(&qp->s_lock);
qp->s_cur = n;
+ priv->pending_tid_r_segs = 0;
+ priv->pending_tid_w_resp = 0;
+ qp->s_num_rd_atomic = 0;
/*
* If we are starting the request from the beginning,
@@ -961,9 +1519,9 @@
qp->s_state = OP(SEND_LAST);
goto done;
}
+ update_num_rd_atomic(qp, psn, wqe);
/* Find the work request opcode corresponding to the given PSN. */
- opcode = wqe->wr.opcode;
for (;;) {
int diff;
@@ -973,8 +1531,11 @@
break;
wqe = rvt_get_swqe_ptr(qp, n);
diff = cmp_psn(psn, wqe->psn);
- if (diff < 0)
+ if (diff < 0) {
+ /* Point wqe back to the previous one*/
+ wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
break;
+ }
qp->s_cur = n;
/*
* If we are starting the request from the beginning,
@@ -984,8 +1545,10 @@
qp->s_state = OP(SEND_LAST);
goto done;
}
- opcode = wqe->wr.opcode;
+
+ update_num_rd_atomic(qp, psn, wqe);
}
+ opcode = wqe->wr.opcode;
/*
* Set the state to restart in the middle of a request.
@@ -1003,10 +1566,18 @@
qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
break;
+ case IB_WR_TID_RDMA_WRITE:
+ qp->s_state = TID_OP(WRITE_RESP);
+ break;
+
case IB_WR_RDMA_READ:
qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
break;
+ case IB_WR_TID_RDMA_READ:
+ qp->s_state = TID_OP(READ_RESP);
+ break;
+
default:
/*
* This case shouldn't happen since its only
@@ -1015,6 +1586,7 @@
qp->s_state = OP(SEND_LAST);
}
done:
+ priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
qp->s_psn = psn;
/*
* Set RVT_S_WAIT_PSN as rc_complete() may start the timer
@@ -1025,6 +1597,7 @@
(cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
qp->s_flags |= RVT_S_WAIT_PSN;
qp->s_flags &= ~HFI1_S_AHG_VALID;
+ trace_hfi1_sender_reset_psn(qp);
}
/*
@@ -1033,18 +1606,47 @@
*/
void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
{
+ struct hfi1_qp_priv *priv = qp->priv;
struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
struct hfi1_ibport *ibp;
lockdep_assert_held(&qp->r_lock);
lockdep_assert_held(&qp->s_lock);
+ trace_hfi1_sender_restart_rc(qp);
if (qp->s_retry == 0) {
if (qp->s_mig_state == IB_MIG_ARMED) {
hfi1_migrate_qp(qp);
qp->s_retry = qp->s_retry_cnt;
} else if (qp->s_last == qp->s_acked) {
- hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
- rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ /*
+ * We need special handling for the OPFN request WQEs as
+ * they are not allowed to generate real user errors
+ */
+ if (wqe->wr.opcode == IB_WR_OPFN) {
+ struct hfi1_ibport *ibp =
+ to_iport(qp->ibqp.device, qp->port_num);
+ /*
+ * Call opfn_conn_reply() with capcode and
+ * remaining data as 0 to close out the
+ * current request
+ */
+ opfn_conn_reply(qp, priv->opfn.curr);
+ wqe = do_rc_completion(qp, wqe, ibp);
+ qp->s_flags &= ~RVT_S_WAIT_ACK;
+ } else {
+ trace_hfi1_tid_write_sender_restart_rc(qp, 0);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ struct tid_rdma_request *req;
+
+ req = wqe_to_tid_req(wqe);
+ hfi1_kern_exp_rcv_clear_all(req);
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+ }
+
+ hfi1_trdma_send_complete(qp, wqe,
+ IB_WC_RETRY_EXC_ERR);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ }
return;
} else { /* need to handle delayed completion */
return;
@@ -1054,14 +1656,15 @@
}
ibp = to_iport(qp->ibqp.device, qp->port_num);
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ)
ibp->rvp.n_rc_resends++;
else
ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
- RVT_S_WAIT_ACK);
+ RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP);
if (wait)
qp->s_flags |= RVT_S_SEND_ONE;
reset_psn(qp, psn);
@@ -1069,7 +1672,8 @@
/*
* Set qp->s_sending_psn to the next PSN after the given one.
- * This would be psn+1 except when RDMA reads are present.
+ * This would be psn+1 except when RDMA reads or TID RDMA ops
+ * are present.
*/
static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
{
@@ -1081,7 +1685,9 @@
for (;;) {
wqe = rvt_get_swqe_ptr(qp, n);
if (cmp_psn(psn, wqe->lpsn) <= 0) {
- if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
qp->s_sending_psn = wqe->lpsn + 1;
else
qp->s_sending_psn = psn + 1;
@@ -1094,6 +1700,36 @@
}
}
+/**
+ * hfi1_rc_verbs_aborted - handle abort status
+ * @qp: the QP
+ * @opah: the opa header
+ *
+ * This code modifies both ACK bit in BTH[2]
+ * and the s_flags to go into send one mode.
+ *
+ * This serves to throttle the send engine to only
+ * send a single packet in the likely case the
+ * a link has gone down.
+ */
+void hfi1_rc_verbs_aborted(struct rvt_qp *qp, struct hfi1_opa_header *opah)
+{
+ struct ib_other_headers *ohdr = hfi1_get_rc_ohdr(opah);
+ u8 opcode = ib_bth_get_opcode(ohdr);
+ u32 psn;
+
+ /* ignore responses */
+ if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
+ opcode == TID_OP(READ_RESP) ||
+ opcode == TID_OP(WRITE_RESP))
+ return;
+
+ psn = ib_bth_get_psn(ohdr) | IB_BTH_REQ_ACK;
+ ohdr->bth[2] = cpu_to_be32(psn);
+ qp->s_flags |= RVT_S_SEND_ONE;
+}
+
/*
* This should be called with the QP s_lock held and interrupts disabled.
*/
@@ -1102,70 +1738,104 @@
struct ib_other_headers *ohdr;
struct hfi1_qp_priv *priv = qp->priv;
struct rvt_swqe *wqe;
- struct ib_header *hdr = NULL;
- struct hfi1_16b_header *hdr_16b = NULL;
- u32 opcode;
+ u32 opcode, head, tail;
u32 psn;
+ struct tid_rdma_request *req;
lockdep_assert_held(&qp->s_lock);
if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
return;
- /* Find out where the BTH is */
- if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
- hdr = &opah->ibh;
- if (ib_get_lnh(hdr) == HFI1_LRH_BTH)
- ohdr = &hdr->u.oth;
- else
- ohdr = &hdr->u.l.oth;
- } else {
- u8 l4;
-
- hdr_16b = &opah->opah;
- l4 = hfi1_16B_get_l4(hdr_16b);
- if (l4 == OPA_16B_L4_IB_LOCAL)
- ohdr = &hdr_16b->u.oth;
- else
- ohdr = &hdr_16b->u.l.oth;
- }
-
+ ohdr = hfi1_get_rc_ohdr(opah);
opcode = ib_bth_get_opcode(ohdr);
- if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
- opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+ if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
+ opcode == TID_OP(READ_RESP) ||
+ opcode == TID_OP(WRITE_RESP)) {
WARN_ON(!qp->s_rdma_ack_cnt);
qp->s_rdma_ack_cnt--;
return;
}
psn = ib_bth_get_psn(ohdr);
- reset_sending_psn(qp, psn);
+ /*
+ * Don't attempt to reset the sending PSN for packets in the
+ * KDETH PSN space since the PSN does not match anything.
+ */
+ if (opcode != TID_OP(WRITE_DATA) &&
+ opcode != TID_OP(WRITE_DATA_LAST) &&
+ opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC))
+ reset_sending_psn(qp, psn);
+
+ /* Handle TID RDMA WRITE packets differently */
+ if (opcode >= TID_OP(WRITE_REQ) &&
+ opcode <= TID_OP(WRITE_DATA_LAST)) {
+ head = priv->s_tid_head;
+ tail = priv->s_tid_cur;
+ /*
+ * s_tid_cur is set to s_tid_head in the case, where
+ * a new TID RDMA request is being started and all
+ * previous ones have been completed.
+ * Therefore, we need to do a secondary check in order
+ * to properly determine whether we should start the
+ * RC timer.
+ */
+ wqe = rvt_get_swqe_ptr(qp, tail);
+ req = wqe_to_tid_req(wqe);
+ if (head == tail && req->comp_seg < req->total_segs) {
+ if (tail == 0)
+ tail = qp->s_size - 1;
+ else
+ tail -= 1;
+ }
+ } else {
+ head = qp->s_tail;
+ tail = qp->s_acked;
+ }
/*
* Start timer after a packet requesting an ACK has been sent and
* there are still requests that haven't been acked.
*/
- if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+ if ((psn & IB_BTH_REQ_ACK) && tail != head &&
+ opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) &&
+ opcode != TID_OP(RESYNC) &&
!(qp->s_flags &
- (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
- (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
- rvt_add_retry_timer(qp);
+ (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
+ (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ if (opcode == TID_OP(READ_REQ))
+ rvt_add_retry_timer_ext(qp, priv->timeout_shift);
+ else
+ rvt_add_retry_timer(qp);
+ }
+
+ /* Start TID RDMA ACK timer */
+ if ((opcode == TID_OP(WRITE_DATA) ||
+ opcode == TID_OP(WRITE_DATA_LAST) ||
+ opcode == TID_OP(RESYNC)) &&
+ (psn & IB_BTH_REQ_ACK) &&
+ !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) &&
+ (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ /*
+ * The TID RDMA ACK packet could be received before this
+ * function is called. Therefore, add the timer only if TID
+ * RDMA ACK packets are actually pending.
+ */
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ req = wqe_to_tid_req(wqe);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ req->ack_seg < req->cur_seg)
+ hfi1_add_tid_retry_timer(qp);
+ }
while (qp->s_last != qp->s_acked) {
- u32 s_last;
-
wqe = rvt_get_swqe_ptr(qp, qp->s_last);
if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
break;
- s_last = qp->s_last;
- trace_hfi1_qp_send_completion(qp, wqe, s_last);
- if (++s_last >= qp->s_size)
- s_last = 0;
- qp->s_last = s_last;
- /* see post_send() */
- barrier();
- rvt_put_swqe(wqe);
- rvt_qp_swqe_complete(qp,
+ trdma_clean_swqe(qp, wqe);
+ trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
+ rvt_qp_complete_swqe(qp,
wqe,
ib_hfi1_wc_opcode[wqe->wr.opcode],
IB_WC_SUCCESS);
@@ -1194,29 +1864,24 @@
* This is similar to hfi1_send_complete but has to check to be sure
* that the SGEs are not being referenced if the SWQE is being resent.
*/
-static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
- struct rvt_swqe *wqe,
- struct hfi1_ibport *ibp)
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
+ struct rvt_swqe *wqe,
+ struct hfi1_ibport *ibp)
{
+ struct hfi1_qp_priv *priv = qp->priv;
+
lockdep_assert_held(&qp->s_lock);
/*
* Don't decrement refcount and don't generate a
* completion if the SWQE is being resent until the send
* is finished.
*/
+ trace_hfi1_rc_completion(qp, wqe->lpsn);
if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
- u32 s_last;
-
- rvt_put_swqe(wqe);
- s_last = qp->s_last;
- trace_hfi1_qp_send_completion(qp, wqe, s_last);
- if (++s_last >= qp->s_size)
- s_last = 0;
- qp->s_last = s_last;
- /* see post_send() */
- barrier();
- rvt_qp_swqe_complete(qp,
+ trdma_clean_swqe(qp, wqe);
+ trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
+ rvt_qp_complete_swqe(qp,
wqe,
ib_hfi1_wc_opcode[wqe->wr.opcode],
IB_WC_SUCCESS);
@@ -1241,7 +1906,16 @@
}
qp->s_retry = qp->s_retry_cnt;
- update_last_psn(qp, wqe->lpsn);
+ /*
+ * Don't update the last PSN if the request being completed is
+ * a TID RDMA WRITE request.
+ * Completion of the TID RDMA WRITE requests are done by the
+ * TID RDMA ACKs and as such could be for a request that has
+ * already been ACKed as far as the IB state machine is
+ * concerned.
+ */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ update_last_psn(qp, wqe->lpsn);
/*
* If we are completing a request which is in the process of
@@ -1264,9 +1938,61 @@
qp->s_draining = 0;
wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
}
+ if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) {
+ priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
+ hfi1_schedule_send(qp);
+ }
return wqe;
}
+static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd)
+{
+ /* Retry this request. */
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ rvt_get_qp(qp);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+ }
+}
+
+/**
+ * update_qp_retry_state - Update qp retry state.
+ * @qp: the QP
+ * @psn: the packet sequence number of the TID RDMA WRITE RESP.
+ * @spsn: The start psn for the given TID RDMA WRITE swqe.
+ * @lpsn: The last psn for the given TID RDMA WRITE swqe.
+ *
+ * This function is called to update the qp retry state upon
+ * receiving a TID WRITE RESP after the qp is scheduled to retry
+ * a request.
+ */
+static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn,
+ u32 lpsn)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ qp->s_psn = psn + 1;
+ /*
+ * If this is the first TID RDMA WRITE RESP packet for the current
+ * request, change the s_state so that the retry will be processed
+ * correctly. Similarly, if this is the last TID RDMA WRITE RESP
+ * packet, change the s_state and advance the s_cur.
+ */
+ if (cmp_psn(psn, lpsn) >= 0) {
+ qp->s_cur = qpriv->s_tid_cur + 1;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ qp->s_state = TID_OP(WRITE_REQ);
+ } else if (!cmp_psn(psn, spsn)) {
+ qp->s_cur = qpriv->s_tid_cur;
+ qp->s_state = TID_OP(WRITE_RESP);
+ }
+}
+
/**
* do_rc_ack - process an incoming RC ACK
* @qp: the QP the ACK came in on
@@ -1278,15 +2004,17 @@
* May be called at interrupt level, with the QP s_lock held.
* Returns 1 if OK, 0 if current operation should be aborted (NAK).
*/
-static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
- u64 val, struct hfi1_ctxtdata *rcd)
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
+ u64 val, struct hfi1_ctxtdata *rcd)
{
struct hfi1_ibport *ibp;
enum ib_wc_status status;
+ struct hfi1_qp_priv *qpriv = qp->priv;
struct rvt_swqe *wqe;
int ret = 0;
u32 ack_psn;
int diff;
+ struct rvt_dev_info *rdi;
lockdep_assert_held(&qp->s_lock);
/*
@@ -1329,20 +2057,14 @@
*/
if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
(opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+ (wqe->wr.opcode == IB_WR_TID_RDMA_READ &&
+ (opcode != TID_OP(READ_RESP) || diff != 0)) ||
((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
- (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
- /* Retry this request. */
- if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
- qp->r_flags |= RVT_R_RDMAR_SEQ;
- hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
- if (list_empty(&qp->rspwait)) {
- qp->r_flags |= RVT_R_RSP_SEND;
- rvt_get_qp(qp);
- list_add_tail(&qp->rspwait,
- &rcd->qp_wait_list);
- }
- }
+ (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) ||
+ (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ (delta_psn(psn, qp->s_last_psn) != 1))) {
+ set_restart_qp(qp, rcd);
/*
* No need to process the ACK/NAK since we are
* restarting an earlier request.
@@ -1354,6 +2076,9 @@
u64 *vaddr = wqe->sg_list[0].vaddr;
*vaddr = val;
}
+ if (wqe->wr.opcode == IB_WR_OPFN)
+ opfn_conn_reply(qp, val);
+
if (qp->s_num_rd_atomic &&
(wqe->wr.opcode == IB_WR_RDMA_READ ||
wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
@@ -1371,26 +2096,85 @@
hfi1_schedule_send(qp);
}
}
+
+ /*
+ * TID RDMA WRITE requests will be completed by the TID RDMA
+ * ACK packet handler (see tid_rdma.c).
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+ break;
+
wqe = do_rc_completion(qp, wqe, ibp);
if (qp->s_acked == qp->s_tail)
break;
}
+ trace_hfi1_rc_ack_do(qp, aeth, psn, wqe);
+ trace_hfi1_sender_do_rc_ack(qp);
switch (aeth >> IB_AETH_NAK_SHIFT) {
case 0: /* ACK */
this_cpu_inc(*ibp->rvp.rc_acks);
- if (qp->s_acked != qp->s_tail) {
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ if (wqe_to_tid_req(wqe)->ack_pending)
+ rvt_mod_retry_timer_ext(qp,
+ qpriv->timeout_shift);
+ else
+ rvt_stop_rc_timers(qp);
+ } else if (qp->s_acked != qp->s_tail) {
+ struct rvt_swqe *__w = NULL;
+
+ if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID)
+ __w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
+
/*
- * We are expecting more ACKs so
- * mod the retry timer.
+ * Stop timers if we've received all of the TID RDMA
+ * WRITE * responses.
*/
- rvt_mod_retry_timer(qp);
- /*
- * We can stop re-sending the earlier packets and
- * continue with the next packet the receiver wants.
- */
- if (cmp_psn(qp->s_psn, psn) <= 0)
- reset_psn(qp, psn + 1);
+ if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ opcode == TID_OP(WRITE_RESP)) {
+ /*
+ * Normally, the loop above would correctly
+ * process all WQEs from s_acked onward and
+ * either complete them or check for correct
+ * PSN sequencing.
+ * However, for TID RDMA, due to pipelining,
+ * the response may not be for the request at
+ * s_acked so the above look would just be
+ * skipped. This does not allow for checking
+ * the PSN sequencing. It has to be done
+ * separately.
+ */
+ if (cmp_psn(psn, qp->s_last_psn + 1)) {
+ set_restart_qp(qp, rcd);
+ goto bail_stop;
+ }
+ /*
+ * If the psn is being resent, stop the
+ * resending.
+ */
+ if (qp->s_cur != qp->s_tail &&
+ cmp_psn(qp->s_psn, psn) <= 0)
+ update_qp_retry_state(qp, psn,
+ __w->psn,
+ __w->lpsn);
+ else if (--qpriv->pending_tid_w_resp)
+ rvt_mod_retry_timer(qp);
+ else
+ rvt_stop_rc_timers(qp);
+ } else {
+ /*
+ * We are expecting more ACKs so
+ * mod the retry timer.
+ */
+ rvt_mod_retry_timer(qp);
+ /*
+ * We can stop re-sending the earlier packets
+ * and continue with the next packet the
+ * receiver wants.
+ */
+ if (cmp_psn(qp->s_psn, psn) <= 0)
+ reset_psn(qp, psn + 1);
+ }
} else {
/* No more acks - kill all timers */
rvt_stop_rc_timers(qp);
@@ -1406,6 +2190,15 @@
rvt_get_credit(qp, aeth);
qp->s_rnr_retry = qp->s_rnr_retry_cnt;
qp->s_retry = qp->s_retry_cnt;
+ /*
+ * If the current request is a TID RDMA WRITE request and the
+ * response is not a TID RDMA WRITE RESP packet, s_last_psn
+ * can't be advanced.
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ opcode != TID_OP(WRITE_RESP) &&
+ cmp_psn(psn, wqe->psn) >= 0)
+ return 1;
update_last_psn(qp, psn);
return 1;
@@ -1415,20 +2208,31 @@
goto bail_stop;
if (qp->s_flags & RVT_S_WAIT_RNR)
goto bail_stop;
- if (qp->s_rnr_retry == 0) {
- status = IB_WC_RNR_RETRY_EXC_ERR;
- goto class_b;
+ rdi = ib_to_rvt(qp->ibqp.device);
+ if (!(rdi->post_parms[wqe->wr.opcode].flags &
+ RVT_OPERATION_IGN_RNR_CNT)) {
+ if (qp->s_rnr_retry == 0) {
+ status = IB_WC_RNR_RETRY_EXC_ERR;
+ goto class_b;
+ }
+ if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
+ qp->s_rnr_retry--;
}
- if (qp->s_rnr_retry_cnt < 7)
- qp->s_rnr_retry--;
- /* The last valid PSN is the previous PSN. */
- update_last_psn(qp, psn - 1);
+ /*
+ * The last valid PSN is the previous PSN. For TID RDMA WRITE
+ * request, s_last_psn should be incremented only when a TID
+ * RDMA WRITE RESP is received to avoid skipping lost TID RDMA
+ * WRITE RESP packets.
+ */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ reset_psn(qp, qp->s_last_psn + 1);
+ } else {
+ update_last_psn(qp, psn - 1);
+ reset_psn(qp, psn);
+ }
ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
-
- reset_psn(qp, psn);
-
qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
rvt_stop_rc_timers(qp);
rvt_add_rnr_timer(qp, aeth);
@@ -1468,7 +2272,10 @@
ibp->rvp.n_other_naks++;
class_b:
if (qp->s_last == qp->s_acked) {
- hfi1_send_complete(qp, wqe, status);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ hfi1_kern_read_tid_flow_free(qp);
+
+ hfi1_trdma_send_complete(qp, wqe, status);
rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
}
break;
@@ -1509,6 +2316,8 @@
while (cmp_psn(psn, wqe->lpsn) > 0) {
if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_WRITE ||
wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
break;
@@ -1644,7 +2453,8 @@
qp->s_rdma_read_len -= pmtu;
update_last_psn(qp, psn);
spin_unlock_irqrestore(&qp->s_lock, flags);
- hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false);
+ rvt_copy_sge(qp, &qp->s_rdma_read_sge,
+ data, pmtu, false, false);
goto bail;
case OP(RDMA_READ_RESPONSE_ONLY):
@@ -1684,7 +2494,8 @@
if (unlikely(tlen != qp->s_rdma_read_len))
goto ack_len_err;
aeth = be32_to_cpu(ohdr->u.aeth);
- hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false);
+ rvt_copy_sge(qp, &qp->s_rdma_read_sge,
+ data, tlen, false, false);
WARN_ON(qp->s_rdma_read_sge.num_sge);
(void)do_rc_ack(qp, aeth, psn,
OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
@@ -1704,7 +2515,7 @@
status = IB_WC_LOC_LEN_ERR;
ack_err:
if (qp->s_last == qp->s_acked) {
- hfi1_send_complete(qp, wqe, status);
+ rvt_send_complete(qp, wqe, status);
rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
}
ack_done:
@@ -1713,16 +2524,6 @@
return;
}
-static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
- struct rvt_qp *qp)
-{
- if (list_empty(&qp->rspwait)) {
- qp->r_flags |= RVT_R_RSP_NAK;
- rvt_get_qp(qp);
- list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
- }
-}
-
static inline void rc_cancel_ack(struct rvt_qp *qp)
{
qp->r_adefered = 0;
@@ -1755,8 +2556,9 @@
struct hfi1_ibport *ibp = rcd_to_iport(rcd);
struct rvt_ack_entry *e;
unsigned long flags;
- u8 i, prev;
- int old_req;
+ u8 prev;
+ u8 mra; /* most recent ACK */
+ bool old_req;
trace_hfi1_rcv_error(qp, psn);
if (diff > 0) {
@@ -1802,29 +2604,8 @@
spin_lock_irqsave(&qp->s_lock, flags);
- for (i = qp->r_head_ack_queue; ; i = prev) {
- if (i == qp->s_tail_ack_queue)
- old_req = 0;
- if (i)
- prev = i - 1;
- else
- prev = HFI1_MAX_RDMA_ATOMIC;
- if (prev == qp->r_head_ack_queue) {
- e = NULL;
- break;
- }
- e = &qp->s_ack_queue[prev];
- if (!e->opcode) {
- e = NULL;
- break;
- }
- if (cmp_psn(psn, e->psn) >= 0) {
- if (prev == qp->s_tail_ack_queue &&
- cmp_psn(psn, e->lpsn) <= 0)
- old_req = 0;
- break;
- }
- }
+ e = find_prev_entry(qp, psn, &prev, &mra, &old_req);
+
switch (opcode) {
case OP(RDMA_READ_REQUEST): {
struct ib_reth *reth;
@@ -1850,10 +2631,7 @@
len = be32_to_cpu(reth->length);
if (unlikely(offset + len != e->rdma_sge.sge_length))
goto unlock_done;
- if (e->rdma_sge.mr) {
- rvt_put_mr(e->rdma_sge.mr);
- e->rdma_sge.mr = NULL;
- }
+ release_rdma_sge_mr(e);
if (len != 0) {
u32 rkey = be32_to_cpu(reth->rkey);
u64 vaddr = get_ib_reth_vaddr(reth);
@@ -1871,6 +2649,8 @@
e->psn = psn;
if (old_req)
goto unlock_done;
+ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = prev;
qp->s_tail_ack_queue = prev;
break;
}
@@ -1884,6 +2664,8 @@
*/
if (!e || e->opcode != (u8)opcode || old_req)
goto unlock_done;
+ if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
+ qp->s_acked_ack_queue = prev;
qp->s_tail_ack_queue = prev;
break;
}
@@ -1899,7 +2681,7 @@
* Resend the most recent ACK if this request is
* after all the previous RDMA reads and atomics.
*/
- if (i == qp->r_head_ack_queue) {
+ if (mra == qp->r_head_ack_queue) {
spin_unlock_irqrestore(&qp->s_lock, flags);
qp->r_nak_state = 0;
qp->r_ack_psn = qp->r_psn - 1;
@@ -1910,7 +2692,9 @@
* Resend the RDMA read or atomic op which
* ACKs this duplicate request.
*/
- qp->s_tail_ack_queue = i;
+ if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
+ qp->s_acked_ack_queue = mra;
+ qp->s_tail_ack_queue = mra;
break;
}
qp->s_ack_state = OP(ACKNOWLEDGE);
@@ -1927,17 +2711,6 @@
return 0;
}
-static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
-{
- unsigned next;
-
- next = n + 1;
- if (next > HFI1_MAX_RDMA_ATOMIC)
- next = 0;
- qp->s_tail_ack_queue = next;
- qp->s_ack_state = OP(ACKNOWLEDGE);
-}
-
static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
u32 lqpn, u32 rqpn, u8 svc_type)
{
@@ -2035,6 +2808,7 @@
void *data = packet->payload;
u32 tlen = packet->tlen;
struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
struct hfi1_ibport *ibp = rcd_to_iport(rcd);
struct ib_other_headers *ohdr = packet->ohdr;
u32 opcode = packet->opcode;
@@ -2047,8 +2821,7 @@
struct ib_reth *reth;
unsigned long flags;
int ret;
- bool is_fecn = false;
- bool copy_last = false;
+ bool copy_last = false, fecn;
u32 rkey;
u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);
@@ -2057,7 +2830,8 @@
if (hfi1_ruc_check_hdr(ibp, packet))
return;
- is_fecn = process_ecn(qp, packet, false);
+ fecn = process_ecn(qp, packet);
+ opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1]));
/*
* Process responses (ACKs) before anything else. Note that the
@@ -2068,8 +2842,6 @@
if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
rc_rcv_resp(packet);
- if (is_fecn)
- goto send_ack;
return;
}
@@ -2144,7 +2916,7 @@
qp->r_rcv_len += pmtu;
if (unlikely(qp->r_rcv_len > qp->r_len))
goto nack_inv;
- hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false);
+ rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false);
break;
case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
@@ -2200,7 +2972,7 @@
wc.byte_len = tlen + qp->r_rcv_len;
if (unlikely(wc.byte_len > qp->r_len))
goto nack_inv;
- hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last);
+ rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last);
rvt_put_ss(&qp->r_sge);
qp->r_msn++;
if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
@@ -2233,8 +3005,7 @@
wc.dlid_path_bits = 0;
wc.port_num = 0;
/* Signal completion event if the solicited bit is set. */
- rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
- ib_bth_is_solicited(ohdr));
+ rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
break;
case OP(RDMA_WRITE_ONLY):
@@ -2291,20 +3062,17 @@
if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
goto nack_inv;
next = qp->r_head_ack_queue + 1;
- /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
- if (next > HFI1_MAX_RDMA_ATOMIC)
+ /* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
spin_lock_irqsave(&qp->s_lock, flags);
- if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (unlikely(next == qp->s_acked_ack_queue)) {
if (!qp->s_ack_queue[next].sent)
goto nack_inv_unlck;
update_ack_queue(qp, next);
}
e = &qp->s_ack_queue[qp->r_head_ack_queue];
- if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
- rvt_put_mr(e->rdma_sge.mr);
- e->rdma_sge.mr = NULL;
- }
+ release_rdma_sge_mr(e);
reth = &ohdr->u.rc.reth;
len = be32_to_cpu(reth->length);
if (len) {
@@ -2342,45 +3110,49 @@
qp->r_state = opcode;
qp->r_nak_state = 0;
qp->r_head_ack_queue = next;
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
/* Schedule the send engine. */
qp->s_flags |= RVT_S_RESP_PENDING;
+ if (fecn)
+ qp->s_flags |= RVT_S_ECN;
hfi1_schedule_send(qp);
spin_unlock_irqrestore(&qp->s_lock, flags);
- if (is_fecn)
- goto send_ack;
return;
}
case OP(COMPARE_SWAP):
case OP(FETCH_ADD): {
- struct ib_atomic_eth *ateth;
+ struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth;
+ u64 vaddr = get_ib_ateth_vaddr(ateth);
+ bool opfn = opcode == OP(COMPARE_SWAP) &&
+ vaddr == HFI1_VERBS_E_ATOMIC_VADDR;
struct rvt_ack_entry *e;
- u64 vaddr;
atomic64_t *maddr;
u64 sdata;
u32 rkey;
u8 next;
- if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+ !opfn))
goto nack_inv;
next = qp->r_head_ack_queue + 1;
- if (next > HFI1_MAX_RDMA_ATOMIC)
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
next = 0;
spin_lock_irqsave(&qp->s_lock, flags);
- if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (unlikely(next == qp->s_acked_ack_queue)) {
if (!qp->s_ack_queue[next].sent)
goto nack_inv_unlck;
update_ack_queue(qp, next);
}
e = &qp->s_ack_queue[qp->r_head_ack_queue];
- if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
- rvt_put_mr(e->rdma_sge.mr);
- e->rdma_sge.mr = NULL;
+ release_rdma_sge_mr(e);
+ /* Process OPFN special virtual address */
+ if (opfn) {
+ opfn_conn_response(qp, e, ateth);
+ goto ack;
}
- ateth = &ohdr->u.atomic_eth;
- vaddr = get_ib_ateth_vaddr(ateth);
if (unlikely(vaddr & (sizeof(u64) - 1)))
goto nack_inv_unlck;
rkey = be32_to_cpu(ateth->rkey);
@@ -2399,6 +3171,7 @@
sdata);
rvt_put_mr(qp->r_sge.sge.mr);
qp->r_sge.num_sge = 0;
+ack:
e->opcode = opcode;
e->sent = 0;
e->psn = psn;
@@ -2408,14 +3181,15 @@
qp->r_state = opcode;
qp->r_nak_state = 0;
qp->r_head_ack_queue = next;
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
/* Schedule the send engine. */
qp->s_flags |= RVT_S_RESP_PENDING;
+ if (fecn)
+ qp->s_flags |= RVT_S_ECN;
hfi1_schedule_send(qp);
spin_unlock_irqrestore(&qp->s_lock, flags);
- if (is_fecn)
- goto send_ack;
return;
}
@@ -2428,16 +3202,9 @@
qp->r_ack_psn = psn;
qp->r_nak_state = 0;
/* Send an ACK if requested or required. */
- if (psn & IB_BTH_REQ_ACK) {
- if (packet->numpkt == 0) {
- rc_cancel_ack(qp);
- goto send_ack;
- }
- if (qp->r_adefered >= HFI1_PSN_CREDIT) {
- rc_cancel_ack(qp);
- goto send_ack;
- }
- if (unlikely(is_fecn)) {
+ if (psn & IB_BTH_REQ_ACK || fecn) {
+ if (packet->numpkt == 0 || fecn ||
+ qp->r_adefered >= HFI1_PSN_CREDIT) {
rc_cancel_ack(qp);
goto send_ack;
}
@@ -2478,7 +3245,7 @@
qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
qp->r_ack_psn = qp->r_psn;
send_ack:
- hfi1_send_rc_ack(packet, is_fecn);
+ hfi1_send_rc_ack(packet, fecn);
}
void hfi1_rc_hdrerr(
diff --git a/drivers/infiniband/hw/hfi1/rc.h b/drivers/infiniband/hw/hfi1/rc.h
new file mode 100644
index 0000000..5ed5e85
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/rc.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+
+#ifndef HFI1_RC_H
+#define HFI1_RC_H
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static inline void update_ack_queue(struct rvt_qp *qp, unsigned int n)
+{
+ unsigned int next;
+
+ next = n + 1;
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ next = 0;
+ qp->s_tail_ack_queue = next;
+ qp->s_acked_ack_queue = next;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
+ struct rvt_qp *qp)
+{
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_NAK;
+ rvt_get_qp(qp);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+}
+
+static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
+ u32 psn, u32 pmtu)
+{
+ u32 len;
+
+ len = delta_psn(psn, wqe->psn) * pmtu;
+ return rvt_restart_sge(ss, wqe, len);
+}
+
+static inline void release_rdma_sge_mr(struct rvt_ack_entry *e)
+{
+ if (e->rdma_sge.mr) {
+ rvt_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+}
+
+struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
+ u8 *prev_ack, bool *scheduled);
+int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val,
+ struct hfi1_ctxtdata *rcd);
+struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct hfi1_ibport *ibp);
+
+#endif /* HFI1_RC_H */
diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c
index 5f56f3c..23ac605 100644
--- a/drivers/infiniband/hw/hfi1/ruc.c
+++ b/drivers/infiniband/hw/hfi1/ruc.c
@@ -156,333 +156,6 @@
}
/**
- * ruc_loopback - handle UC and RC loopback requests
- * @sqp: the sending QP
- *
- * This is called from hfi1_do_send() to
- * forward a WQE addressed to the same HFI.
- * Note that although we are single threaded due to the send engine, we still
- * have to protect against post_send(). We don't have to worry about
- * receive interrupts since this is a connected protocol and all packets
- * will pass through here.
- */
-static void ruc_loopback(struct rvt_qp *sqp)
-{
- struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
- struct rvt_qp *qp;
- struct rvt_swqe *wqe;
- struct rvt_sge *sge;
- unsigned long flags;
- struct ib_wc wc;
- u64 sdata;
- atomic64_t *maddr;
- enum ib_wc_status send_status;
- bool release;
- int ret;
- bool copy_last = false;
- int local_ops = 0;
-
- rcu_read_lock();
-
- /*
- * Note that we check the responder QP state after
- * checking the requester's state.
- */
- qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
- sqp->remote_qpn);
-
- spin_lock_irqsave(&sqp->s_lock, flags);
-
- /* Return if we are already busy processing a work request. */
- if ((sqp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT)) ||
- !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
- goto unlock;
-
- sqp->s_flags |= RVT_S_BUSY;
-
-again:
- if (sqp->s_last == READ_ONCE(sqp->s_head))
- goto clr_busy;
- wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
-
- /* Return if it is not OK to start a new work request. */
- if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
- if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
- goto clr_busy;
- /* We are in the error state, flush the work request. */
- send_status = IB_WC_WR_FLUSH_ERR;
- goto flush_send;
- }
-
- /*
- * We can rely on the entry not changing without the s_lock
- * being held until we update s_last.
- * We increment s_cur to indicate s_last is in progress.
- */
- if (sqp->s_last == sqp->s_cur) {
- if (++sqp->s_cur >= sqp->s_size)
- sqp->s_cur = 0;
- }
- spin_unlock_irqrestore(&sqp->s_lock, flags);
-
- if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
- qp->ibqp.qp_type != sqp->ibqp.qp_type) {
- ibp->rvp.n_pkt_drops++;
- /*
- * For RC, the requester would timeout and retry so
- * shortcut the timeouts and just signal too many retries.
- */
- if (sqp->ibqp.qp_type == IB_QPT_RC)
- send_status = IB_WC_RETRY_EXC_ERR;
- else
- send_status = IB_WC_SUCCESS;
- goto serr;
- }
-
- memset(&wc, 0, sizeof(wc));
- send_status = IB_WC_SUCCESS;
-
- release = true;
- sqp->s_sge.sge = wqe->sg_list[0];
- sqp->s_sge.sg_list = wqe->sg_list + 1;
- sqp->s_sge.num_sge = wqe->wr.num_sge;
- sqp->s_len = wqe->length;
- switch (wqe->wr.opcode) {
- case IB_WR_REG_MR:
- goto send_comp;
-
- case IB_WR_LOCAL_INV:
- if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
- if (rvt_invalidate_rkey(sqp,
- wqe->wr.ex.invalidate_rkey))
- send_status = IB_WC_LOC_PROT_ERR;
- local_ops = 1;
- }
- goto send_comp;
-
- case IB_WR_SEND_WITH_INV:
- if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) {
- wc.wc_flags = IB_WC_WITH_INVALIDATE;
- wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey;
- }
- goto send;
-
- case IB_WR_SEND_WITH_IMM:
- wc.wc_flags = IB_WC_WITH_IMM;
- wc.ex.imm_data = wqe->wr.ex.imm_data;
- /* FALLTHROUGH */
- case IB_WR_SEND:
-send:
- ret = rvt_get_rwqe(qp, false);
- if (ret < 0)
- goto op_err;
- if (!ret)
- goto rnr_nak;
- break;
-
- case IB_WR_RDMA_WRITE_WITH_IMM:
- if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
- goto inv_err;
- wc.wc_flags = IB_WC_WITH_IMM;
- wc.ex.imm_data = wqe->wr.ex.imm_data;
- ret = rvt_get_rwqe(qp, true);
- if (ret < 0)
- goto op_err;
- if (!ret)
- goto rnr_nak;
- /* skip copy_last set and qp_access_flags recheck */
- goto do_write;
- case IB_WR_RDMA_WRITE:
- copy_last = rvt_is_user_qp(qp);
- if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
- goto inv_err;
-do_write:
- if (wqe->length == 0)
- break;
- if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
- wqe->rdma_wr.remote_addr,
- wqe->rdma_wr.rkey,
- IB_ACCESS_REMOTE_WRITE)))
- goto acc_err;
- qp->r_sge.sg_list = NULL;
- qp->r_sge.num_sge = 1;
- qp->r_sge.total_len = wqe->length;
- break;
-
- case IB_WR_RDMA_READ:
- if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
- goto inv_err;
- if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
- wqe->rdma_wr.remote_addr,
- wqe->rdma_wr.rkey,
- IB_ACCESS_REMOTE_READ)))
- goto acc_err;
- release = false;
- sqp->s_sge.sg_list = NULL;
- sqp->s_sge.num_sge = 1;
- qp->r_sge.sge = wqe->sg_list[0];
- qp->r_sge.sg_list = wqe->sg_list + 1;
- qp->r_sge.num_sge = wqe->wr.num_sge;
- qp->r_sge.total_len = wqe->length;
- break;
-
- case IB_WR_ATOMIC_CMP_AND_SWP:
- case IB_WR_ATOMIC_FETCH_AND_ADD:
- if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
- goto inv_err;
- if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
- wqe->atomic_wr.remote_addr,
- wqe->atomic_wr.rkey,
- IB_ACCESS_REMOTE_ATOMIC)))
- goto acc_err;
- /* Perform atomic OP and save result. */
- maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
- sdata = wqe->atomic_wr.compare_add;
- *(u64 *)sqp->s_sge.sge.vaddr =
- (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
- (u64)atomic64_add_return(sdata, maddr) - sdata :
- (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
- sdata, wqe->atomic_wr.swap);
- rvt_put_mr(qp->r_sge.sge.mr);
- qp->r_sge.num_sge = 0;
- goto send_comp;
-
- default:
- send_status = IB_WC_LOC_QP_OP_ERR;
- goto serr;
- }
-
- sge = &sqp->s_sge.sge;
- while (sqp->s_len) {
- u32 len = sqp->s_len;
-
- if (len > sge->length)
- len = sge->length;
- if (len > sge->sge_length)
- len = sge->sge_length;
- WARN_ON_ONCE(len == 0);
- hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last);
- sge->vaddr += len;
- sge->length -= len;
- sge->sge_length -= len;
- if (sge->sge_length == 0) {
- if (!release)
- rvt_put_mr(sge->mr);
- if (--sqp->s_sge.num_sge)
- *sge = *sqp->s_sge.sg_list++;
- } else if (sge->length == 0 && sge->mr->lkey) {
- if (++sge->n >= RVT_SEGSZ) {
- if (++sge->m >= sge->mr->mapsz)
- break;
- sge->n = 0;
- }
- sge->vaddr =
- sge->mr->map[sge->m]->segs[sge->n].vaddr;
- sge->length =
- sge->mr->map[sge->m]->segs[sge->n].length;
- }
- sqp->s_len -= len;
- }
- if (release)
- rvt_put_ss(&qp->r_sge);
-
- if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
- goto send_comp;
-
- if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
- wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
- else
- wc.opcode = IB_WC_RECV;
- wc.wr_id = qp->r_wr_id;
- wc.status = IB_WC_SUCCESS;
- wc.byte_len = wqe->length;
- wc.qp = &qp->ibqp;
- wc.src_qp = qp->remote_qpn;
- wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
- wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
- wc.port_num = 1;
- /* Signal completion event if the solicited bit is set. */
- rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
- wqe->wr.send_flags & IB_SEND_SOLICITED);
-
-send_comp:
- spin_lock_irqsave(&sqp->s_lock, flags);
- ibp->rvp.n_loop_pkts++;
-flush_send:
- sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
- hfi1_send_complete(sqp, wqe, send_status);
- if (local_ops) {
- atomic_dec(&sqp->local_ops_pending);
- local_ops = 0;
- }
- goto again;
-
-rnr_nak:
- /* Handle RNR NAK */
- if (qp->ibqp.qp_type == IB_QPT_UC)
- goto send_comp;
- ibp->rvp.n_rnr_naks++;
- /*
- * Note: we don't need the s_lock held since the BUSY flag
- * makes this single threaded.
- */
- if (sqp->s_rnr_retry == 0) {
- send_status = IB_WC_RNR_RETRY_EXC_ERR;
- goto serr;
- }
- if (sqp->s_rnr_retry_cnt < 7)
- sqp->s_rnr_retry--;
- spin_lock_irqsave(&sqp->s_lock, flags);
- if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
- goto clr_busy;
- rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer <<
- IB_AETH_CREDIT_SHIFT);
- goto clr_busy;
-
-op_err:
- send_status = IB_WC_REM_OP_ERR;
- wc.status = IB_WC_LOC_QP_OP_ERR;
- goto err;
-
-inv_err:
- send_status = IB_WC_REM_INV_REQ_ERR;
- wc.status = IB_WC_LOC_QP_OP_ERR;
- goto err;
-
-acc_err:
- send_status = IB_WC_REM_ACCESS_ERR;
- wc.status = IB_WC_LOC_PROT_ERR;
-err:
- /* responder goes to error state */
- rvt_rc_error(qp, wc.status);
-
-serr:
- spin_lock_irqsave(&sqp->s_lock, flags);
- hfi1_send_complete(sqp, wqe, send_status);
- if (sqp->ibqp.qp_type == IB_QPT_RC) {
- int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
-
- sqp->s_flags &= ~RVT_S_BUSY;
- spin_unlock_irqrestore(&sqp->s_lock, flags);
- if (lastwqe) {
- struct ib_event ev;
-
- ev.device = sqp->ibqp.device;
- ev.element.qp = &sqp->ibqp;
- ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
- sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
- }
- goto done;
- }
-clr_busy:
- sqp->s_flags &= ~RVT_S_BUSY;
-unlock:
- spin_unlock_irqrestore(&sqp->s_lock, flags);
-done:
- rcu_read_unlock();
-}
-
-/**
* hfi1_make_grh - construct a GRH header
* @ibp: a pointer to the IB port
* @hdr: a pointer to the GRH header being constructed
@@ -577,7 +250,6 @@
struct ib_other_headers *ohdr,
u32 bth0, u32 bth1, u32 bth2)
{
- bth1 |= qp->remote_qpn;
ohdr->bth[0] = cpu_to_be32(bth0);
ohdr->bth[1] = cpu_to_be32(bth1);
ohdr->bth[2] = cpu_to_be32(bth2);
@@ -599,13 +271,13 @@
*/
static inline void hfi1_make_ruc_header_16B(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2,
+ int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibport *ibp = ps->ibp;
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
- u32 bth1 = 0;
u32 slid;
u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
u8 l4 = OPA_16B_L4_IB_LOCAL;
@@ -687,12 +359,12 @@
*/
static inline void hfi1_make_ruc_header_9B(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2,
+ int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_ibport *ibp = ps->ibp;
- u32 bth1 = 0;
u16 pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
u16 lrh0 = HFI1_LRH_BTH;
u8 extra_bytes = -ps->s_txreq->s_cur_size & 3;
@@ -742,7 +414,7 @@
typedef void (*hfi1_make_ruc_hdr)(struct rvt_qp *qp,
struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps);
/* We support only two types - 9B and 16B for now */
@@ -752,7 +424,7 @@
};
void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps)
{
struct hfi1_qp_priv *priv = qp->priv;
@@ -773,18 +445,21 @@
priv->s_ahg->ahgidx = 0;
/* Make the appropriate header */
- hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth2, middle, ps);
+ hfi1_ruc_header_tbl[priv->hdr_type](qp, ohdr, bth0, bth1, bth2, middle,
+ ps);
}
/* when sending, force a reschedule every one of these periods */
#define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */
/**
- * schedule_send_yield - test for a yield required for QP send engine
+ * hfi1_schedule_send_yield - test for a yield required for QP
+ * send engine
* @timeout: Final time for timeout slice for jiffies
* @qp: a pointer to QP
* @ps: a pointer to a structure with commonly lookup values for
* the the send engine progress
+ * @tid - true if it is the tid leg
*
* This routine checks if the time slice for the QP has expired
* for RC QPs, if so an additional work entry is queued. At this
@@ -792,8 +467,8 @@
* returns true if a yield is required, otherwise, false
* is returned.
*/
-static bool schedule_send_yield(struct rvt_qp *qp,
- struct hfi1_pkt_state *ps)
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ bool tid)
{
ps->pkts_sent = true;
@@ -801,8 +476,24 @@
if (!ps->in_thread ||
workqueue_congested(ps->cpu, ps->ppd->hfi1_wq)) {
spin_lock_irqsave(&qp->s_lock, ps->flags);
- qp->s_flags &= ~RVT_S_BUSY;
- hfi1_schedule_send(qp);
+ if (!tid) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ hfi1_schedule_send(qp);
+ } else {
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ if (priv->s_flags &
+ HFI1_S_TID_BUSY_SET) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ priv->s_flags &=
+ ~(HFI1_S_TID_BUSY_SET |
+ RVT_S_BUSY);
+ } else {
+ priv->s_flags &= ~RVT_S_BUSY;
+ }
+ hfi1_schedule_tid_send(qp);
+ }
+
spin_unlock_irqrestore(&qp->s_lock, ps->flags);
this_cpu_inc(*ps->ppd->dd->send_schedule);
trace_hfi1_rc_expired_time_slice(qp, true);
@@ -825,15 +516,15 @@
void _hfi1_do_send(struct work_struct *work)
{
- struct iowait *wait = container_of(work, struct iowait, iowork);
- struct rvt_qp *qp = iowait_to_qp(wait);
+ struct iowait_work *w = container_of(work, struct iowait_work, iowork);
+ struct rvt_qp *qp = iowait_to_qp(w->iow);
hfi1_do_send(qp, true);
}
/**
* hfi1_do_send - perform a send on a QP
- * @work: contains a pointer to the QP
+ * @qp: a pointer to the QP
* @in_thread: true if in a workqueue thread
*
* Process entries in the send work queue until credit or queue is
@@ -850,6 +541,7 @@
ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
ps.ppd = ppd_from_ibp(ps.ibp);
ps.in_thread = in_thread;
+ ps.wait = iowait_get_ib_work(&priv->s_iowait);
trace_hfi1_rc_do_send(qp, in_thread);
@@ -858,7 +550,7 @@
if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) &
~((1 << ps.ppd->lmc) - 1)) ==
ps.ppd->lid)) {
- ruc_loopback(qp);
+ rvt_ruc_loopback(qp);
return;
}
make_req = hfi1_make_rc_req;
@@ -868,7 +560,7 @@
if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) &
~((1 << ps.ppd->lmc) - 1)) ==
ps.ppd->lid)) {
- ruc_loopback(qp);
+ rvt_ruc_loopback(qp);
return;
}
make_req = hfi1_make_uc_req;
@@ -883,6 +575,8 @@
/* Return if we are already busy processing a work request. */
if (!hfi1_send_ok(qp)) {
+ if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
spin_unlock_irqrestore(&qp->s_lock, ps.flags);
return;
}
@@ -896,10 +590,12 @@
ps.pkts_sent = false;
/* insure a pre-built packet is handled */
- ps.s_txreq = get_waiting_verbs_txreq(qp);
+ ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
do {
/* Check for a constructed packet to be sent. */
if (ps.s_txreq) {
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET)
+ qp->s_flags |= RVT_S_BUSY;
spin_unlock_irqrestore(&qp->s_lock, ps.flags);
/*
* If the packet cannot be sent now, return and
@@ -907,8 +603,9 @@
*/
if (hfi1_verbs_send(qp, &ps))
return;
+
/* allow other tasks to run */
- if (schedule_send_yield(qp, &ps))
+ if (hfi1_schedule_send_yield(qp, &ps, false))
return;
spin_lock_irqsave(&qp->s_lock, ps.flags);
@@ -917,44 +614,3 @@
iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
spin_unlock_irqrestore(&qp->s_lock, ps.flags);
}
-
-/*
- * This should be called with s_lock held.
- */
-void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
- enum ib_wc_status status)
-{
- u32 old_last, last;
-
- if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
- return;
-
- last = qp->s_last;
- old_last = last;
- trace_hfi1_qp_send_completion(qp, wqe, last);
- if (++last >= qp->s_size)
- last = 0;
- trace_hfi1_qp_send_completion(qp, wqe, last);
- qp->s_last = last;
- /* See post_send() */
- barrier();
- rvt_put_swqe(wqe);
- if (qp->ibqp.qp_type == IB_QPT_UD ||
- qp->ibqp.qp_type == IB_QPT_SMI ||
- qp->ibqp.qp_type == IB_QPT_GSI)
- atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
-
- rvt_qp_swqe_complete(qp,
- wqe,
- ib_hfi1_wc_opcode[wqe->wr.opcode],
- status);
-
- if (qp->s_acked == old_last)
- qp->s_acked = last;
- if (qp->s_cur == old_last)
- qp->s_cur = last;
- if (qp->s_tail == old_last)
- qp->s_tail = last;
- if (qp->state == IB_QPS_SQD && last == qp->s_cur)
- qp->s_draining = 0;
-}
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index 88e326d..c61b602 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -65,6 +65,7 @@
#define SDMA_DESCQ_CNT 2048
#define SDMA_DESC_INTR 64
#define INVALID_TAIL 0xffff
+#define SDMA_PAD max_t(size_t, MAX_16B_PADDING, sizeof(u32))
static uint sdma_descq_cnt = SDMA_DESCQ_CNT;
module_param(sdma_descq_cnt, uint, S_IRUGO);
@@ -378,7 +379,7 @@
__sdma_txclean(sde->dd, tx);
if (complete)
(*complete)(tx, res);
- if (wait && iowait_sdma_dec(wait))
+ if (iowait_sdma_dec(wait))
iowait_drain_wakeup(wait);
}
@@ -405,19 +406,33 @@
struct sdma_txreq *txp, *txp_next;
LIST_HEAD(flushlist);
unsigned long flags;
+ uint seq;
/* flush from head to tail */
sdma_flush_descq(sde);
spin_lock_irqsave(&sde->flushlist_lock, flags);
/* copy flush list */
- list_for_each_entry_safe(txp, txp_next, &sde->flushlist, list) {
- list_del_init(&txp->list);
- list_add_tail(&txp->list, &flushlist);
- }
+ list_splice_init(&sde->flushlist, &flushlist);
spin_unlock_irqrestore(&sde->flushlist_lock, flags);
/* flush from flush list */
list_for_each_entry_safe(txp, txp_next, &flushlist, list)
complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED);
+ /* wakeup QPs orphaned on the dmawait list */
+ do {
+ struct iowait *w, *nw;
+
+ seq = read_seqbegin(&sde->waitlock);
+ if (!list_empty(&sde->dmawait)) {
+ write_seqlock(&sde->waitlock);
+ list_for_each_entry_safe(w, nw, &sde->dmawait, list) {
+ if (w->wakeup) {
+ w->wakeup(w, SDMA_AVAIL_REASON);
+ list_del_init(&w->list);
+ }
+ }
+ write_sequnlock(&sde->waitlock);
+ }
+ } while (read_seqretry(&sde->waitlock, seq));
}
/*
@@ -855,14 +870,13 @@
{
struct sdma_rht_node *rht_node;
struct sdma_engine *sde = NULL;
- const struct cpumask *current_mask = ¤t->cpus_allowed;
unsigned long cpu_id;
/*
* To ensure that always the same sdma engine(s) will be
* selected make sure the process is pinned to this CPU only.
*/
- if (cpumask_weight(current_mask) != 1)
+ if (current->nr_cpus_allowed != 1)
goto out;
cpu_id = smp_processor_id();
@@ -1283,7 +1297,7 @@
struct sdma_engine *sde;
if (dd->sdma_pad_dma) {
- dma_free_coherent(&dd->pcidev->dev, 4,
+ dma_free_coherent(&dd->pcidev->dev, SDMA_PAD,
(void *)dd->sdma_pad_dma,
dd->sdma_pad_phys);
dd->sdma_pad_dma = NULL;
@@ -1424,6 +1438,7 @@
seqlock_init(&sde->head_lock);
spin_lock_init(&sde->senddmactrl_lock);
spin_lock_init(&sde->flushlist_lock);
+ seqlock_init(&sde->waitlock);
/* insure there is always a zero bit */
sde->ahg_bits = 0xfffffffe00000000ULL;
@@ -1452,12 +1467,9 @@
timer_setup(&sde->err_progress_check_timer,
sdma_err_progress_check, 0);
- sde->descq = dma_zalloc_coherent(
- &dd->pcidev->dev,
- descq_cnt * sizeof(u64[2]),
- &sde->descq_phys,
- GFP_KERNEL
- );
+ sde->descq = dma_alloc_coherent(&dd->pcidev->dev,
+ descq_cnt * sizeof(u64[2]),
+ &sde->descq_phys, GFP_KERNEL);
if (!sde->descq)
goto bail;
sde->tx_ring =
@@ -1470,24 +1482,18 @@
dd->sdma_heads_size = L1_CACHE_BYTES * num_engines;
/* Allocate memory for DMA of head registers to memory */
- dd->sdma_heads_dma = dma_zalloc_coherent(
- &dd->pcidev->dev,
- dd->sdma_heads_size,
- &dd->sdma_heads_phys,
- GFP_KERNEL
- );
+ dd->sdma_heads_dma = dma_alloc_coherent(&dd->pcidev->dev,
+ dd->sdma_heads_size,
+ &dd->sdma_heads_phys,
+ GFP_KERNEL);
if (!dd->sdma_heads_dma) {
dd_dev_err(dd, "failed to allocate SendDMA head memory\n");
goto bail;
}
/* Allocate memory for pad */
- dd->sdma_pad_dma = dma_zalloc_coherent(
- &dd->pcidev->dev,
- sizeof(u32),
- &dd->sdma_pad_phys,
- GFP_KERNEL
- );
+ dd->sdma_pad_dma = dma_alloc_coherent(&dd->pcidev->dev, SDMA_PAD,
+ &dd->sdma_pad_phys, GFP_KERNEL);
if (!dd->sdma_pad_dma) {
dd_dev_err(dd, "failed to allocate SendDMA pad memory\n");
goto bail;
@@ -1521,8 +1527,11 @@
}
ret = rhashtable_init(tmp_sdma_rht, &sdma_rht_params);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(tmp_sdma_rht);
goto bail;
+ }
+
dd->sdma_rht = tmp_sdma_rht;
dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma);
@@ -1755,12 +1764,9 @@
*/
static void sdma_desc_avail(struct sdma_engine *sde, uint avail)
{
- struct iowait *wait, *nw;
+ struct iowait *wait, *nw, *twait;
struct iowait *waits[SDMA_WAIT_BATCH_SIZE];
- uint i, n = 0, seq, max_idx = 0;
- struct sdma_txreq *stx;
- struct hfi1_ibdev *dev = &sde->dd->verbs_dev;
- u8 max_starved_cnt = 0;
+ uint i, n = 0, seq, tidx = 0;
#ifdef CONFIG_SDMA_VERBOSITY
dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx,
@@ -1769,49 +1775,50 @@
#endif
do {
- seq = read_seqbegin(&dev->iowait_lock);
+ seq = read_seqbegin(&sde->waitlock);
if (!list_empty(&sde->dmawait)) {
/* at least one item */
- write_seqlock(&dev->iowait_lock);
+ write_seqlock(&sde->waitlock);
/* Harvest waiters wanting DMA descriptors */
list_for_each_entry_safe(
wait,
nw,
&sde->dmawait,
list) {
- u16 num_desc = 0;
+ u32 num_desc;
if (!wait->wakeup)
continue;
if (n == ARRAY_SIZE(waits))
break;
- if (!list_empty(&wait->tx_head)) {
- stx = list_first_entry(
- &wait->tx_head,
- struct sdma_txreq,
- list);
- num_desc = stx->num_desc;
- }
+ iowait_init_priority(wait);
+ num_desc = iowait_get_all_desc(wait);
if (num_desc > avail)
break;
avail -= num_desc;
- /* Find the most starved wait memeber */
- iowait_starve_find_max(wait, &max_starved_cnt,
- n, &max_idx);
+ /* Find the top-priority wait memeber */
+ if (n) {
+ twait = waits[tidx];
+ tidx =
+ iowait_priority_update_top(wait,
+ twait,
+ n,
+ tidx);
+ }
list_del_init(&wait->list);
waits[n++] = wait;
}
- write_sequnlock(&dev->iowait_lock);
+ write_sequnlock(&sde->waitlock);
break;
}
- } while (read_seqretry(&dev->iowait_lock, seq));
+ } while (read_seqretry(&sde->waitlock, seq));
- /* Schedule the most starved one first */
+ /* Schedule the top-priority entry first */
if (n)
- waits[max_idx]->wakeup(waits[max_idx], SDMA_AVAIL_REASON);
+ waits[tidx]->wakeup(waits[tidx], SDMA_AVAIL_REASON);
for (i = 0; i < n; i++)
- if (i != max_idx)
+ if (i != tidx)
waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON);
}
@@ -2346,7 +2353,7 @@
*/
static int sdma_check_progress(
struct sdma_engine *sde,
- struct iowait *wait,
+ struct iowait_work *wait,
struct sdma_txreq *tx,
bool pkts_sent)
{
@@ -2356,12 +2363,12 @@
if (tx->num_desc <= sde->desc_avail)
return -EAGAIN;
/* pulse the head_lock */
- if (wait && wait->sleep) {
+ if (wait && iowait_ioww_to_iow(wait)->sleep) {
unsigned seq;
seq = raw_seqcount_begin(
(const seqcount_t *)&sde->head_lock.seqcount);
- ret = wait->sleep(sde, wait, tx, seq, pkts_sent);
+ ret = wait->iow->sleep(sde, wait, tx, seq, pkts_sent);
if (ret == -EAGAIN)
sde->desc_avail = sdma_descq_freecnt(sde);
} else {
@@ -2373,7 +2380,7 @@
/**
* sdma_send_txreq() - submit a tx req to ring
* @sde: sdma engine to use
- * @wait: wait structure to use when full (may be NULL)
+ * @wait: SE wait structure to use when full (may be NULL)
* @tx: sdma_txreq to submit
* @pkts_sent: has any packet been sent yet?
*
@@ -2386,7 +2393,7 @@
* -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
*/
int sdma_send_txreq(struct sdma_engine *sde,
- struct iowait *wait,
+ struct iowait_work *wait,
struct sdma_txreq *tx,
bool pkts_sent)
{
@@ -2397,7 +2404,7 @@
/* user should have supplied entire packet */
if (unlikely(tx->tlen))
return -EINVAL;
- tx->wait = wait;
+ tx->wait = iowait_ioww_to_iow(wait);
spin_lock_irqsave(&sde->tail_lock, flags);
retry:
if (unlikely(!__sdma_running(sde)))
@@ -2406,14 +2413,14 @@
goto nodesc;
tail = submit_tx(sde, tx);
if (wait)
- iowait_sdma_inc(wait);
+ iowait_sdma_inc(iowait_ioww_to_iow(wait));
sdma_update_tail(sde, tail);
unlock:
spin_unlock_irqrestore(&sde->tail_lock, flags);
return ret;
unlock_noconn:
if (wait)
- iowait_sdma_inc(wait);
+ iowait_sdma_inc(iowait_ioww_to_iow(wait));
tx->next_descq_idx = 0;
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
tx->sn = sde->tail_sn++;
@@ -2422,11 +2429,8 @@
spin_lock(&sde->flushlist_lock);
list_add_tail(&tx->list, &sde->flushlist);
spin_unlock(&sde->flushlist_lock);
- if (wait) {
- wait->tx_count++;
- wait->count += tx->num_desc;
- }
- schedule_work(&sde->flush_worker);
+ iowait_inc_wait_count(wait, tx->num_desc);
+ queue_work_on(sde->cpu, system_highpri_wq, &sde->flush_worker);
ret = -ECOMM;
goto unlock;
nodesc:
@@ -2442,9 +2446,9 @@
/**
* sdma_send_txlist() - submit a list of tx req to ring
* @sde: sdma engine to use
- * @wait: wait structure to use when full (may be NULL)
+ * @wait: SE wait structure to use when full (may be NULL)
* @tx_list: list of sdma_txreqs to submit
- * @count: pointer to a u32 which, after return will contain the total number of
+ * @count: pointer to a u16 which, after return will contain the total number of
* sdma_txreqs removed from the tx_list. This will include sdma_txreqs
* whose SDMA descriptors are submitted to the ring and the sdma_txreqs
* which are added to SDMA engine flush list if the SDMA engine state is
@@ -2467,8 +2471,8 @@
* -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL)
* -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state
*/
-int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait,
- struct list_head *tx_list, u32 *count_out)
+int sdma_send_txlist(struct sdma_engine *sde, struct iowait_work *wait,
+ struct list_head *tx_list, u16 *count_out)
{
struct sdma_txreq *tx, *tx_next;
int ret = 0;
@@ -2479,7 +2483,7 @@
spin_lock_irqsave(&sde->tail_lock, flags);
retry:
list_for_each_entry_safe(tx, tx_next, tx_list, list) {
- tx->wait = wait;
+ tx->wait = iowait_ioww_to_iow(wait);
if (unlikely(!__sdma_running(sde)))
goto unlock_noconn;
if (unlikely(tx->num_desc > sde->desc_avail))
@@ -2500,8 +2504,9 @@
update_tail:
total_count = submit_count + flush_count;
if (wait) {
- iowait_sdma_add(wait, total_count);
- iowait_starve_clear(submit_count > 0, wait);
+ iowait_sdma_add(iowait_ioww_to_iow(wait), total_count);
+ iowait_starve_clear(submit_count > 0,
+ iowait_ioww_to_iow(wait));
}
if (tail != INVALID_TAIL)
sdma_update_tail(sde, tail);
@@ -2511,7 +2516,7 @@
unlock_noconn:
spin_lock(&sde->flushlist_lock);
list_for_each_entry_safe(tx, tx_next, tx_list, list) {
- tx->wait = wait;
+ tx->wait = iowait_ioww_to_iow(wait);
list_del_init(&tx->list);
tx->next_descq_idx = 0;
#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER
@@ -2520,13 +2525,10 @@
#endif
list_add_tail(&tx->list, &sde->flushlist);
flush_count++;
- if (wait) {
- wait->tx_count++;
- wait->count += tx->num_desc;
- }
+ iowait_inc_wait_count(wait, tx->num_desc);
}
spin_unlock(&sde->flushlist_lock);
- schedule_work(&sde->flush_worker);
+ queue_work_on(sde->cpu, system_highpri_wq, &sde->flush_worker);
ret = -ECOMM;
goto update_tail;
nodesc:
diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h
index 46c775f..1e2e40f 100644
--- a/drivers/infiniband/hw/hfi1/sdma.h
+++ b/drivers/infiniband/hw/hfi1/sdma.h
@@ -1,7 +1,7 @@
#ifndef _HFI1_SDMA_H
#define _HFI1_SDMA_H
/*
- * Copyright(c) 2015, 2016 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -62,16 +62,6 @@
/* Hardware limit for SDMA packet size */
#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1)
-#define SDMA_TXREQ_S_OK 0
-#define SDMA_TXREQ_S_SENDERROR 1
-#define SDMA_TXREQ_S_ABORTED 2
-#define SDMA_TXREQ_S_SHUTDOWN 3
-
-/* flags bits */
-#define SDMA_TXREQ_F_URGENT 0x0001
-#define SDMA_TXREQ_F_AHG_COPY 0x0002
-#define SDMA_TXREQ_F_USE_AHG 0x0004
-
#define SDMA_MAP_NONE 0
#define SDMA_MAP_SINGLE 1
#define SDMA_MAP_PAGE 2
@@ -392,6 +382,7 @@
u64 progress_int_cnt;
/* private: */
+ seqlock_t waitlock;
struct list_head dmawait;
/* CONFIG SDMA for now, just blindly duplicate */
@@ -415,6 +406,7 @@
struct list_head flushlist;
struct cpumask cpu_mask;
struct kobject kobj;
+ u32 msix_intr;
};
int sdma_init(struct hfi1_devdata *dd, u8 port);
@@ -849,16 +841,16 @@
dd, SDMA_MAP_SINGLE, tx, addr, len);
}
-struct iowait;
+struct iowait_work;
int sdma_send_txreq(struct sdma_engine *sde,
- struct iowait *wait,
+ struct iowait_work *wait,
struct sdma_txreq *tx,
bool pkts_sent);
int sdma_send_txlist(struct sdma_engine *sde,
- struct iowait *wait,
+ struct iowait_work *wait,
struct list_head *tx_list,
- u32 *count);
+ u16 *count_out);
int sdma_ahg_alloc(struct sdma_engine *sde);
void sdma_ahg_free(struct sdma_engine *sde, int ahg_index);
diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h
index bf7d777..514a478 100644
--- a/drivers/infiniband/hw/hfi1/sdma_txreq.h
+++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h
@@ -91,6 +91,7 @@
#define SDMA_TXREQ_F_URGENT 0x0001
#define SDMA_TXREQ_F_AHG_COPY 0x0002
#define SDMA_TXREQ_F_USE_AHG 0x0004
+#define SDMA_TXREQ_F_VIP 0x0010
struct sdma_txreq;
typedef void (*callback_t)(struct sdma_txreq *, int);
diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c
index 25e8673..90f62c4 100644
--- a/drivers/infiniband/hw/hfi1/sysfs.c
+++ b/drivers/infiniband/hw/hfi1/sysfs.c
@@ -494,20 +494,21 @@
* Start of per-unit (or driver, in some cases, but replicated
* per unit) functions (these get a device *)
*/
-static ssize_t show_rev(struct device *device, struct device_attribute *attr,
- char *buf)
+static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr,
+ char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
}
+static DEVICE_ATTR_RO(hw_rev);
-static ssize_t show_hfi(struct device *device, struct device_attribute *attr,
- char *buf)
+static ssize_t board_id_show(struct device *device,
+ struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
int ret;
@@ -517,23 +518,25 @@
ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
return ret;
}
+static DEVICE_ATTR_RO(board_id);
-static ssize_t show_boardversion(struct device *device,
+static ssize_t boardversion_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
/* The string printed here is already newline-terminated. */
return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion);
}
+static DEVICE_ATTR_RO(boardversion);
-static ssize_t show_nctxts(struct device *device,
+static ssize_t nctxts_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
/*
@@ -546,34 +549,37 @@
min(dd->num_user_contexts,
(u32)dd->sc_sizes[SC_USER].count));
}
+static DEVICE_ATTR_RO(nctxts);
-static ssize_t show_nfreectxts(struct device *device,
+static ssize_t nfreectxts_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
/* Return the number of free user ports (contexts) available. */
return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
}
+static DEVICE_ATTR_RO(nfreectxts);
-static ssize_t show_serial(struct device *device,
+static ssize_t serial_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
return scnprintf(buf, PAGE_SIZE, "%s", dd->serial);
}
+static DEVICE_ATTR_RO(serial);
-static ssize_t store_chip_reset(struct device *device,
+static ssize_t chip_reset_store(struct device *device,
struct device_attribute *attr, const char *buf,
size_t count)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
int ret;
@@ -586,6 +592,7 @@
bail:
return ret < 0 ? ret : count;
}
+static DEVICE_ATTR_WO(chip_reset);
/*
* Convert the reported temperature from an integer (reported in
@@ -598,11 +605,11 @@
/*
* Dump tempsense values, in decimal, to ease shell-scripts.
*/
-static ssize_t show_tempsense(struct device *device,
+static ssize_t tempsense_show(struct device *device,
struct device_attribute *attr, char *buf)
{
struct hfi1_ibdev *dev =
- container_of(device, struct hfi1_ibdev, rdi.ibdev.dev);
+ rdma_device_to_drv_device(device, struct hfi1_ibdev, rdi.ibdev);
struct hfi1_devdata *dd = dd_from_dev(dev);
struct hfi1_temp temp;
int ret;
@@ -622,6 +629,7 @@
}
return ret;
}
+static DEVICE_ATTR_RO(tempsense);
/*
* end of per-unit (or driver, in some cases, but replicated
@@ -629,24 +637,20 @@
*/
/* start of per-unit file structures and support code */
-static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
-static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL);
-static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
-static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
-static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
-static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
-static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
-static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
+static struct attribute *hfi1_attributes[] = {
+ &dev_attr_hw_rev.attr,
+ &dev_attr_board_id.attr,
+ &dev_attr_nctxts.attr,
+ &dev_attr_nfreectxts.attr,
+ &dev_attr_serial.attr,
+ &dev_attr_boardversion.attr,
+ &dev_attr_tempsense.attr,
+ &dev_attr_chip_reset.attr,
+ NULL,
+};
-static struct device_attribute *hfi1_attributes[] = {
- &dev_attr_hw_rev,
- &dev_attr_board_id,
- &dev_attr_nctxts,
- &dev_attr_nfreectxts,
- &dev_attr_serial,
- &dev_attr_boardversion,
- &dev_attr_tempsense,
- &dev_attr_chip_reset,
+const struct attribute_group ib_hfi1_attr_group = {
+ .attrs = hfi1_attributes,
};
int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num,
@@ -832,12 +836,6 @@
struct device *class_dev = &dev->dev;
int i, j, ret;
- for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) {
- ret = device_create_file(&dev->dev, hfi1_attributes[i]);
- if (ret)
- goto bail;
- }
-
for (i = 0; i < dd->num_sdma; i++) {
ret = kobject_init_and_add(&dd->per_sdma[i].kobj,
&sde_ktype, &class_dev->kobj,
@@ -855,9 +853,6 @@
return 0;
bail:
- for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i)
- device_remove_file(&dev->dev, hfi1_attributes[i]);
-
for (i = 0; i < dd->num_sdma; i++)
kobject_del(&dd->per_sdma[i].kobj);
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
new file mode 100644
index 0000000..e53f542
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -0,0 +1,5507 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+
+#include "hfi.h"
+#include "qp.h"
+#include "rc.h"
+#include "verbs.h"
+#include "tid_rdma.h"
+#include "exp_rcv.h"
+#include "trace.h"
+
+/**
+ * DOC: TID RDMA READ protocol
+ *
+ * This is an end-to-end protocol at the hfi1 level between two nodes that
+ * improves performance by avoiding data copy on the requester side. It
+ * converts a qualified RDMA READ request into a TID RDMA READ request on
+ * the requester side and thereafter handles the request and response
+ * differently. To be qualified, the RDMA READ request should meet the
+ * following:
+ * -- The total data length should be greater than 256K;
+ * -- The total data length should be a multiple of 4K page size;
+ * -- Each local scatter-gather entry should be 4K page aligned;
+ * -- Each local scatter-gather entry should be a multiple of 4K page size;
+ */
+
+#define RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK BIT_ULL(32)
+#define RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK BIT_ULL(33)
+#define RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK BIT_ULL(34)
+#define RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK BIT_ULL(35)
+#define RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK BIT_ULL(37)
+#define RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK BIT_ULL(38)
+
+/* Maximum number of packets within a flow generation. */
+#define MAX_TID_FLOW_PSN BIT(HFI1_KDETH_BTH_SEQ_SHIFT)
+
+#define GENERATION_MASK 0xFFFFF
+
+static u32 mask_generation(u32 a)
+{
+ return a & GENERATION_MASK;
+}
+
+/* Reserved generation value to set to unused flows for kernel contexts */
+#define KERN_GENERATION_RESERVED mask_generation(U32_MAX)
+
+/*
+ * J_KEY for kernel contexts when TID RDMA is used.
+ * See generate_jkey() in hfi.h for more information.
+ */
+#define TID_RDMA_JKEY 32
+#define HFI1_KERNEL_MIN_JKEY HFI1_ADMIN_JKEY_RANGE
+#define HFI1_KERNEL_MAX_JKEY (2 * HFI1_ADMIN_JKEY_RANGE - 1)
+
+/* Maximum number of segments in flight per QP request. */
+#define TID_RDMA_MAX_READ_SEGS_PER_REQ 6
+#define TID_RDMA_MAX_WRITE_SEGS_PER_REQ 4
+#define MAX_REQ max_t(u16, TID_RDMA_MAX_READ_SEGS_PER_REQ, \
+ TID_RDMA_MAX_WRITE_SEGS_PER_REQ)
+#define MAX_FLOWS roundup_pow_of_two(MAX_REQ + 1)
+
+#define MAX_EXPECTED_PAGES (MAX_EXPECTED_BUFFER / PAGE_SIZE)
+
+#define TID_RDMA_DESTQP_FLOW_SHIFT 11
+#define TID_RDMA_DESTQP_FLOW_MASK 0x1f
+
+#define TID_OPFN_QP_CTXT_MASK 0xff
+#define TID_OPFN_QP_CTXT_SHIFT 56
+#define TID_OPFN_QP_KDETH_MASK 0xff
+#define TID_OPFN_QP_KDETH_SHIFT 48
+#define TID_OPFN_MAX_LEN_MASK 0x7ff
+#define TID_OPFN_MAX_LEN_SHIFT 37
+#define TID_OPFN_TIMEOUT_MASK 0x1f
+#define TID_OPFN_TIMEOUT_SHIFT 32
+#define TID_OPFN_RESERVED_MASK 0x3f
+#define TID_OPFN_RESERVED_SHIFT 26
+#define TID_OPFN_URG_MASK 0x1
+#define TID_OPFN_URG_SHIFT 25
+#define TID_OPFN_VER_MASK 0x7
+#define TID_OPFN_VER_SHIFT 22
+#define TID_OPFN_JKEY_MASK 0x3f
+#define TID_OPFN_JKEY_SHIFT 16
+#define TID_OPFN_MAX_READ_MASK 0x3f
+#define TID_OPFN_MAX_READ_SHIFT 10
+#define TID_OPFN_MAX_WRITE_MASK 0x3f
+#define TID_OPFN_MAX_WRITE_SHIFT 4
+
+/*
+ * OPFN TID layout
+ *
+ * 63 47 31 15
+ * NNNNNNNNKKKKKKKK MMMMMMMMMMMTTTTT DDDDDDUVVVJJJJJJ RRRRRRWWWWWWCCCC
+ * 3210987654321098 7654321098765432 1098765432109876 5432109876543210
+ * N - the context Number
+ * K - the Kdeth_qp
+ * M - Max_len
+ * T - Timeout
+ * D - reserveD
+ * V - version
+ * U - Urg capable
+ * J - Jkey
+ * R - max_Read
+ * W - max_Write
+ * C - Capcode
+ */
+
+static void tid_rdma_trigger_resume(struct work_struct *work);
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req);
+static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
+ gfp_t gfp);
+static void hfi1_init_trdma_req(struct rvt_qp *qp,
+ struct tid_rdma_request *req);
+static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx);
+static void hfi1_tid_timeout(struct timer_list *t);
+static void hfi1_add_tid_reap_timer(struct rvt_qp *qp);
+static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp);
+static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp);
+static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp);
+static void hfi1_tid_retry_timeout(struct timer_list *t);
+static int make_tid_rdma_ack(struct rvt_qp *qp,
+ struct ib_other_headers *ohdr,
+ struct hfi1_pkt_state *ps);
+static void hfi1_do_tid_send(struct rvt_qp *qp);
+static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx);
+static void tid_rdma_rcv_err(struct hfi1_packet *packet,
+ struct ib_other_headers *ohdr,
+ struct rvt_qp *qp, u32 psn, int diff, bool fecn);
+static void update_r_next_psn_fecn(struct hfi1_packet *packet,
+ struct hfi1_qp_priv *priv,
+ struct hfi1_ctxtdata *rcd,
+ struct tid_rdma_flow *flow,
+ bool fecn);
+
+static void validate_r_tid_ack(struct hfi1_qp_priv *priv)
+{
+ if (priv->r_tid_ack == HFI1_QP_WQE_INVALID)
+ priv->r_tid_ack = priv->r_tid_tail;
+}
+
+static void tid_rdma_schedule_ack(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ priv->s_flags |= RVT_S_ACK_PENDING;
+ hfi1_schedule_tid_send(qp);
+}
+
+static void tid_rdma_trigger_ack(struct rvt_qp *qp)
+{
+ validate_r_tid_ack(qp->priv);
+ tid_rdma_schedule_ack(qp);
+}
+
+static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
+{
+ return
+ (((u64)p->qp & TID_OPFN_QP_CTXT_MASK) <<
+ TID_OPFN_QP_CTXT_SHIFT) |
+ ((((u64)p->qp >> 16) & TID_OPFN_QP_KDETH_MASK) <<
+ TID_OPFN_QP_KDETH_SHIFT) |
+ (((u64)((p->max_len >> PAGE_SHIFT) - 1) &
+ TID_OPFN_MAX_LEN_MASK) << TID_OPFN_MAX_LEN_SHIFT) |
+ (((u64)p->timeout & TID_OPFN_TIMEOUT_MASK) <<
+ TID_OPFN_TIMEOUT_SHIFT) |
+ (((u64)p->urg & TID_OPFN_URG_MASK) << TID_OPFN_URG_SHIFT) |
+ (((u64)p->jkey & TID_OPFN_JKEY_MASK) << TID_OPFN_JKEY_SHIFT) |
+ (((u64)p->max_read & TID_OPFN_MAX_READ_MASK) <<
+ TID_OPFN_MAX_READ_SHIFT) |
+ (((u64)p->max_write & TID_OPFN_MAX_WRITE_MASK) <<
+ TID_OPFN_MAX_WRITE_SHIFT);
+}
+
+static void tid_rdma_opfn_decode(struct tid_rdma_params *p, u64 data)
+{
+ p->max_len = (((data >> TID_OPFN_MAX_LEN_SHIFT) &
+ TID_OPFN_MAX_LEN_MASK) + 1) << PAGE_SHIFT;
+ p->jkey = (data >> TID_OPFN_JKEY_SHIFT) & TID_OPFN_JKEY_MASK;
+ p->max_write = (data >> TID_OPFN_MAX_WRITE_SHIFT) &
+ TID_OPFN_MAX_WRITE_MASK;
+ p->max_read = (data >> TID_OPFN_MAX_READ_SHIFT) &
+ TID_OPFN_MAX_READ_MASK;
+ p->qp =
+ ((((data >> TID_OPFN_QP_KDETH_SHIFT) & TID_OPFN_QP_KDETH_MASK)
+ << 16) |
+ ((data >> TID_OPFN_QP_CTXT_SHIFT) & TID_OPFN_QP_CTXT_MASK));
+ p->urg = (data >> TID_OPFN_URG_SHIFT) & TID_OPFN_URG_MASK;
+ p->timeout = (data >> TID_OPFN_TIMEOUT_SHIFT) & TID_OPFN_TIMEOUT_MASK;
+}
+
+void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ p->qp = (kdeth_qp << 16) | priv->rcd->ctxt;
+ p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
+ p->jkey = priv->rcd->jkey;
+ p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
+ p->max_write = TID_RDMA_MAX_WRITE_SEGS_PER_REQ;
+ p->timeout = qp->timeout;
+ p->urg = is_urg_masked(priv->rcd);
+}
+
+bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ *data = tid_rdma_opfn_encode(&priv->tid_rdma.local);
+ return true;
+}
+
+bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct tid_rdma_params *remote, *old;
+ bool ret = true;
+
+ old = rcu_dereference_protected(priv->tid_rdma.remote,
+ lockdep_is_held(&priv->opfn.lock));
+ data &= ~0xfULL;
+ /*
+ * If data passed in is zero, return true so as not to continue the
+ * negotiation process
+ */
+ if (!data || !HFI1_CAP_IS_KSET(TID_RDMA))
+ goto null;
+ /*
+ * If kzalloc fails, return false. This will result in:
+ * * at the requester a new OPFN request being generated to retry
+ * the negotiation
+ * * at the responder, 0 being returned to the requester so as to
+ * disable TID RDMA at both the requester and the responder
+ */
+ remote = kzalloc(sizeof(*remote), GFP_ATOMIC);
+ if (!remote) {
+ ret = false;
+ goto null;
+ }
+
+ tid_rdma_opfn_decode(remote, data);
+ priv->tid_timer_timeout_jiffies =
+ usecs_to_jiffies((((4096UL * (1UL << remote->timeout)) /
+ 1000UL) << 3) * 7);
+ trace_hfi1_opfn_param(qp, 0, &priv->tid_rdma.local);
+ trace_hfi1_opfn_param(qp, 1, remote);
+ rcu_assign_pointer(priv->tid_rdma.remote, remote);
+ /*
+ * A TID RDMA READ request's segment size is not equal to
+ * remote->max_len only when the request's data length is smaller
+ * than remote->max_len. In that case, there will be only one segment.
+ * Therefore, when priv->pkts_ps is used to calculate req->cur_seg
+ * during retry, it will lead to req->cur_seg = 0, which is exactly
+ * what is expected.
+ */
+ priv->pkts_ps = (u16)rvt_div_mtu(qp, remote->max_len);
+ priv->timeout_shift = ilog2(priv->pkts_ps - 1) + 1;
+ goto free;
+null:
+ RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
+ priv->timeout_shift = 0;
+free:
+ if (old)
+ kfree_rcu(old, rcu_head);
+ return ret;
+}
+
+bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data)
+{
+ bool ret;
+
+ ret = tid_rdma_conn_reply(qp, *data);
+ *data = 0;
+ /*
+ * If tid_rdma_conn_reply() returns error, set *data as 0 to indicate
+ * TID RDMA could not be enabled. This will result in TID RDMA being
+ * disabled at the requester too.
+ */
+ if (ret)
+ (void)tid_rdma_conn_req(qp, data);
+ return ret;
+}
+
+void tid_rdma_conn_error(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct tid_rdma_params *old;
+
+ old = rcu_dereference_protected(priv->tid_rdma.remote,
+ lockdep_is_held(&priv->opfn.lock));
+ RCU_INIT_POINTER(priv->tid_rdma.remote, NULL);
+ if (old)
+ kfree_rcu(old, rcu_head);
+}
+
+/* This is called at context initialization time */
+int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit)
+{
+ if (reinit)
+ return 0;
+
+ BUILD_BUG_ON(TID_RDMA_JKEY < HFI1_KERNEL_MIN_JKEY);
+ BUILD_BUG_ON(TID_RDMA_JKEY > HFI1_KERNEL_MAX_JKEY);
+ rcd->jkey = TID_RDMA_JKEY;
+ hfi1_set_ctxt_jkey(rcd->dd, rcd, rcd->jkey);
+ return hfi1_alloc_ctxt_rcv_groups(rcd);
+}
+
+/**
+ * qp_to_rcd - determine the receive context used by a qp
+ * @qp - the qp
+ *
+ * This routine returns the receive context associated
+ * with a a qp's qpn.
+ *
+ * Returns the context.
+ */
+static struct hfi1_ctxtdata *qp_to_rcd(struct rvt_dev_info *rdi,
+ struct rvt_qp *qp)
+{
+ struct hfi1_ibdev *verbs_dev = container_of(rdi,
+ struct hfi1_ibdev,
+ rdi);
+ struct hfi1_devdata *dd = container_of(verbs_dev,
+ struct hfi1_devdata,
+ verbs_dev);
+ unsigned int ctxt;
+
+ if (qp->ibqp.qp_num == 0)
+ ctxt = 0;
+ else
+ ctxt = hfi1_get_qp_map(dd, qp->ibqp.qp_num >> dd->qos_shift);
+ return dd->rcd[ctxt];
+}
+
+int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ struct ib_qp_init_attr *init_attr)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ int i, ret;
+
+ qpriv->rcd = qp_to_rcd(rdi, qp);
+
+ spin_lock_init(&qpriv->opfn.lock);
+ INIT_WORK(&qpriv->opfn.opfn_work, opfn_send_conn_request);
+ INIT_WORK(&qpriv->tid_rdma.trigger_work, tid_rdma_trigger_resume);
+ qpriv->flow_state.psn = 0;
+ qpriv->flow_state.index = RXE_NUM_TID_FLOWS;
+ qpriv->flow_state.last_index = RXE_NUM_TID_FLOWS;
+ qpriv->flow_state.generation = KERN_GENERATION_RESERVED;
+ qpriv->s_state = TID_OP(WRITE_RESP);
+ qpriv->s_tid_cur = HFI1_QP_WQE_INVALID;
+ qpriv->s_tid_head = HFI1_QP_WQE_INVALID;
+ qpriv->s_tid_tail = HFI1_QP_WQE_INVALID;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qpriv->r_tid_head = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_tail = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_ack = HFI1_QP_WQE_INVALID;
+ qpriv->r_tid_alloc = HFI1_QP_WQE_INVALID;
+ atomic_set(&qpriv->n_requests, 0);
+ atomic_set(&qpriv->n_tid_requests, 0);
+ timer_setup(&qpriv->s_tid_timer, hfi1_tid_timeout, 0);
+ timer_setup(&qpriv->s_tid_retry_timer, hfi1_tid_retry_timeout, 0);
+ INIT_LIST_HEAD(&qpriv->tid_wait);
+
+ if (init_attr->qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+ struct hfi1_devdata *dd = qpriv->rcd->dd;
+
+ qpriv->pages = kzalloc_node(TID_RDMA_MAX_PAGES *
+ sizeof(*qpriv->pages),
+ GFP_KERNEL, dd->node);
+ if (!qpriv->pages)
+ return -ENOMEM;
+ for (i = 0; i < qp->s_size; i++) {
+ struct hfi1_swqe_priv *priv;
+ struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
+
+ priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
+ dd->node);
+ if (!priv)
+ return -ENOMEM;
+
+ hfi1_init_trdma_req(qp, &priv->tid_req);
+ priv->tid_req.e.swqe = wqe;
+ wqe->priv = priv;
+ }
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ struct hfi1_ack_priv *priv;
+
+ priv = kzalloc_node(sizeof(*priv), GFP_KERNEL,
+ dd->node);
+ if (!priv)
+ return -ENOMEM;
+
+ hfi1_init_trdma_req(qp, &priv->tid_req);
+ priv->tid_req.e.ack = &qp->s_ack_queue[i];
+
+ ret = hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req,
+ GFP_KERNEL);
+ if (ret) {
+ kfree(priv);
+ return ret;
+ }
+ qp->s_ack_queue[i].priv = priv;
+ }
+ }
+
+ return 0;
+}
+
+void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct rvt_swqe *wqe;
+ u32 i;
+
+ if (qp->ibqp.qp_type == IB_QPT_RC && HFI1_CAP_IS_KSET(TID_RDMA)) {
+ for (i = 0; i < qp->s_size; i++) {
+ wqe = rvt_get_swqe_ptr(qp, i);
+ kfree(wqe->priv);
+ wqe->priv = NULL;
+ }
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ struct hfi1_ack_priv *priv = qp->s_ack_queue[i].priv;
+
+ if (priv)
+ hfi1_kern_exp_rcv_free_flows(&priv->tid_req);
+ kfree(priv);
+ qp->s_ack_queue[i].priv = NULL;
+ }
+ cancel_work_sync(&qpriv->opfn.opfn_work);
+ kfree(qpriv->pages);
+ qpriv->pages = NULL;
+ }
+}
+
+/* Flow and tid waiter functions */
+/**
+ * DOC: lock ordering
+ *
+ * There are two locks involved with the queuing
+ * routines: the qp s_lock and the exp_lock.
+ *
+ * Since the tid space allocation is called from
+ * the send engine, the qp s_lock is already held.
+ *
+ * The allocation routines will get the exp_lock.
+ *
+ * The first_qp() call is provided to allow the head of
+ * the rcd wait queue to be fetched under the exp_lock and
+ * followed by a drop of the exp_lock.
+ *
+ * Any qp in the wait list will have the qp reference count held
+ * to hold the qp in memory.
+ */
+
+/*
+ * return head of rcd wait list
+ *
+ * Must hold the exp_lock.
+ *
+ * Get a reference to the QP to hold the QP in memory.
+ *
+ * The caller must release the reference when the local
+ * is no longer being used.
+ */
+static struct rvt_qp *first_qp(struct hfi1_ctxtdata *rcd,
+ struct tid_queue *queue)
+ __must_hold(&rcd->exp_lock)
+{
+ struct hfi1_qp_priv *priv;
+
+ lockdep_assert_held(&rcd->exp_lock);
+ priv = list_first_entry_or_null(&queue->queue_head,
+ struct hfi1_qp_priv,
+ tid_wait);
+ if (!priv)
+ return NULL;
+ rvt_get_qp(priv->owner);
+ return priv->owner;
+}
+
+/**
+ * kernel_tid_waiters - determine rcd wait
+ * @rcd: the receive context
+ * @qp: the head of the qp being processed
+ *
+ * This routine will return false IFF
+ * the list is NULL or the head of the
+ * list is the indicated qp.
+ *
+ * Must hold the qp s_lock and the exp_lock.
+ *
+ * Return:
+ * false if either of the conditions below are satisfied:
+ * 1. The list is empty or
+ * 2. The indicated qp is at the head of the list and the
+ * HFI1_S_WAIT_TID_SPACE bit is set in qp->s_flags.
+ * true is returned otherwise.
+ */
+static bool kernel_tid_waiters(struct hfi1_ctxtdata *rcd,
+ struct tid_queue *queue, struct rvt_qp *qp)
+ __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+ struct rvt_qp *fqp;
+ bool ret = true;
+
+ lockdep_assert_held(&qp->s_lock);
+ lockdep_assert_held(&rcd->exp_lock);
+ fqp = first_qp(rcd, queue);
+ if (!fqp || (fqp == qp && (qp->s_flags & HFI1_S_WAIT_TID_SPACE)))
+ ret = false;
+ rvt_put_qp(fqp);
+ return ret;
+}
+
+/**
+ * dequeue_tid_waiter - dequeue the qp from the list
+ * @qp - the qp to remove the wait list
+ *
+ * This routine removes the indicated qp from the
+ * wait list if it is there.
+ *
+ * This should be done after the hardware flow and
+ * tid array resources have been allocated.
+ *
+ * Must hold the qp s_lock and the rcd exp_lock.
+ *
+ * It assumes the s_lock to protect the s_flags
+ * field and to reliably test the HFI1_S_WAIT_TID_SPACE flag.
+ */
+static void dequeue_tid_waiter(struct hfi1_ctxtdata *rcd,
+ struct tid_queue *queue, struct rvt_qp *qp)
+ __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ lockdep_assert_held(&rcd->exp_lock);
+ if (list_empty(&priv->tid_wait))
+ return;
+ list_del_init(&priv->tid_wait);
+ qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+ queue->dequeue++;
+ rvt_put_qp(qp);
+}
+
+/**
+ * queue_qp_for_tid_wait - suspend QP on tid space
+ * @rcd: the receive context
+ * @qp: the qp
+ *
+ * The qp is inserted at the tail of the rcd
+ * wait queue and the HFI1_S_WAIT_TID_SPACE s_flag is set.
+ *
+ * Must hold the qp s_lock and the exp_lock.
+ */
+static void queue_qp_for_tid_wait(struct hfi1_ctxtdata *rcd,
+ struct tid_queue *queue, struct rvt_qp *qp)
+ __must_hold(&rcd->exp_lock) __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ lockdep_assert_held(&rcd->exp_lock);
+ if (list_empty(&priv->tid_wait)) {
+ qp->s_flags |= HFI1_S_WAIT_TID_SPACE;
+ list_add_tail(&priv->tid_wait, &queue->queue_head);
+ priv->tid_enqueue = ++queue->enqueue;
+ rcd->dd->verbs_dev.n_tidwait++;
+ trace_hfi1_qpsleep(qp, HFI1_S_WAIT_TID_SPACE);
+ rvt_get_qp(qp);
+ }
+}
+
+/**
+ * __trigger_tid_waiter - trigger tid waiter
+ * @qp: the qp
+ *
+ * This is a private entrance to schedule the qp
+ * assuming the caller is holding the qp->s_lock.
+ */
+static void __trigger_tid_waiter(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ lockdep_assert_held(&qp->s_lock);
+ if (!(qp->s_flags & HFI1_S_WAIT_TID_SPACE))
+ return;
+ trace_hfi1_qpwakeup(qp, HFI1_S_WAIT_TID_SPACE);
+ hfi1_schedule_send(qp);
+}
+
+/**
+ * tid_rdma_schedule_tid_wakeup - schedule wakeup for a qp
+ * @qp - the qp
+ *
+ * trigger a schedule or a waiting qp in a deadlock
+ * safe manner. The qp reference is held prior
+ * to this call via first_qp().
+ *
+ * If the qp trigger was already scheduled (!rval)
+ * the the reference is dropped, otherwise the resume
+ * or the destroy cancel will dispatch the reference.
+ */
+static void tid_rdma_schedule_tid_wakeup(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv;
+ struct hfi1_ibport *ibp;
+ struct hfi1_pportdata *ppd;
+ struct hfi1_devdata *dd;
+ bool rval;
+
+ if (!qp)
+ return;
+
+ priv = qp->priv;
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ppd = ppd_from_ibp(ibp);
+ dd = dd_from_ibdev(qp->ibqp.device);
+
+ rval = queue_work_on(priv->s_sde ?
+ priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(dd->node)),
+ ppd->hfi1_wq,
+ &priv->tid_rdma.trigger_work);
+ if (!rval)
+ rvt_put_qp(qp);
+}
+
+/**
+ * tid_rdma_trigger_resume - field a trigger work request
+ * @work - the work item
+ *
+ * Complete the off qp trigger processing by directly
+ * calling the progress routine.
+ */
+static void tid_rdma_trigger_resume(struct work_struct *work)
+{
+ struct tid_rdma_qp_params *tr;
+ struct hfi1_qp_priv *priv;
+ struct rvt_qp *qp;
+
+ tr = container_of(work, struct tid_rdma_qp_params, trigger_work);
+ priv = container_of(tr, struct hfi1_qp_priv, tid_rdma);
+ qp = priv->owner;
+ spin_lock_irq(&qp->s_lock);
+ if (qp->s_flags & HFI1_S_WAIT_TID_SPACE) {
+ spin_unlock_irq(&qp->s_lock);
+ hfi1_do_send(priv->owner, true);
+ } else {
+ spin_unlock_irq(&qp->s_lock);
+ }
+ rvt_put_qp(qp);
+}
+
+/**
+ * tid_rdma_flush_wait - unwind any tid space wait
+ *
+ * This is called when resetting a qp to
+ * allow a destroy or reset to get rid
+ * of any tid space linkage and reference counts.
+ */
+static void _tid_rdma_flush_wait(struct rvt_qp *qp, struct tid_queue *queue)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv;
+
+ if (!qp)
+ return;
+ lockdep_assert_held(&qp->s_lock);
+ priv = qp->priv;
+ qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+ spin_lock(&priv->rcd->exp_lock);
+ if (!list_empty(&priv->tid_wait)) {
+ list_del_init(&priv->tid_wait);
+ qp->s_flags &= ~HFI1_S_WAIT_TID_SPACE;
+ queue->dequeue++;
+ rvt_put_qp(qp);
+ }
+ spin_unlock(&priv->rcd->exp_lock);
+}
+
+void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ _tid_rdma_flush_wait(qp, &priv->rcd->flow_queue);
+ _tid_rdma_flush_wait(qp, &priv->rcd->rarr_queue);
+}
+
+/* Flow functions */
+/**
+ * kern_reserve_flow - allocate a hardware flow
+ * @rcd - the context to use for allocation
+ * @last - the index of the preferred flow. Use RXE_NUM_TID_FLOWS to
+ * signify "don't care".
+ *
+ * Use a bit mask based allocation to reserve a hardware
+ * flow for use in receiving KDETH data packets. If a preferred flow is
+ * specified the function will attempt to reserve that flow again, if
+ * available.
+ *
+ * The exp_lock must be held.
+ *
+ * Return:
+ * On success: a value postive value between 0 and RXE_NUM_TID_FLOWS - 1
+ * On failure: -EAGAIN
+ */
+static int kern_reserve_flow(struct hfi1_ctxtdata *rcd, int last)
+ __must_hold(&rcd->exp_lock)
+{
+ int nr;
+
+ /* Attempt to reserve the preferred flow index */
+ if (last >= 0 && last < RXE_NUM_TID_FLOWS &&
+ !test_and_set_bit(last, &rcd->flow_mask))
+ return last;
+
+ nr = ffz(rcd->flow_mask);
+ BUILD_BUG_ON(RXE_NUM_TID_FLOWS >=
+ (sizeof(rcd->flow_mask) * BITS_PER_BYTE));
+ if (nr > (RXE_NUM_TID_FLOWS - 1))
+ return -EAGAIN;
+ set_bit(nr, &rcd->flow_mask);
+ return nr;
+}
+
+static void kern_set_hw_flow(struct hfi1_ctxtdata *rcd, u32 generation,
+ u32 flow_idx)
+{
+ u64 reg;
+
+ reg = ((u64)generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
+ RCV_TID_FLOW_TABLE_CTRL_FLOW_VALID_SMASK |
+ RCV_TID_FLOW_TABLE_CTRL_KEEP_AFTER_SEQ_ERR_SMASK |
+ RCV_TID_FLOW_TABLE_CTRL_KEEP_ON_GEN_ERR_SMASK |
+ RCV_TID_FLOW_TABLE_STATUS_SEQ_MISMATCH_SMASK |
+ RCV_TID_FLOW_TABLE_STATUS_GEN_MISMATCH_SMASK;
+
+ if (generation != KERN_GENERATION_RESERVED)
+ reg |= RCV_TID_FLOW_TABLE_CTRL_HDR_SUPP_EN_SMASK;
+
+ write_uctxt_csr(rcd->dd, rcd->ctxt,
+ RCV_TID_FLOW_TABLE + 8 * flow_idx, reg);
+}
+
+static u32 kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
+ __must_hold(&rcd->exp_lock)
+{
+ u32 generation = rcd->flows[flow_idx].generation;
+
+ kern_set_hw_flow(rcd, generation, flow_idx);
+ return generation;
+}
+
+static u32 kern_flow_generation_next(u32 gen)
+{
+ u32 generation = mask_generation(gen + 1);
+
+ if (generation == KERN_GENERATION_RESERVED)
+ generation = mask_generation(generation + 1);
+ return generation;
+}
+
+static void kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, u32 flow_idx)
+ __must_hold(&rcd->exp_lock)
+{
+ rcd->flows[flow_idx].generation =
+ kern_flow_generation_next(rcd->flows[flow_idx].generation);
+ kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, flow_idx);
+}
+
+int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+ struct tid_flow_state *fs = &qpriv->flow_state;
+ struct rvt_qp *fqp;
+ unsigned long flags;
+ int ret = 0;
+
+ /* The QP already has an allocated flow */
+ if (fs->index != RXE_NUM_TID_FLOWS)
+ return ret;
+
+ spin_lock_irqsave(&rcd->exp_lock, flags);
+ if (kernel_tid_waiters(rcd, &rcd->flow_queue, qp))
+ goto queue;
+
+ ret = kern_reserve_flow(rcd, fs->last_index);
+ if (ret < 0)
+ goto queue;
+ fs->index = ret;
+ fs->last_index = fs->index;
+
+ /* Generation received in a RESYNC overrides default flow generation */
+ if (fs->generation != KERN_GENERATION_RESERVED)
+ rcd->flows[fs->index].generation = fs->generation;
+ fs->generation = kern_setup_hw_flow(rcd, fs->index);
+ fs->psn = 0;
+ dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
+ /* get head before dropping lock */
+ fqp = first_qp(rcd, &rcd->flow_queue);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+ tid_rdma_schedule_tid_wakeup(fqp);
+ return 0;
+queue:
+ queue_qp_for_tid_wait(rcd, &rcd->flow_queue, qp);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+ return -EAGAIN;
+}
+
+void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+ struct tid_flow_state *fs = &qpriv->flow_state;
+ struct rvt_qp *fqp;
+ unsigned long flags;
+
+ if (fs->index >= RXE_NUM_TID_FLOWS)
+ return;
+ spin_lock_irqsave(&rcd->exp_lock, flags);
+ kern_clear_hw_flow(rcd, fs->index);
+ clear_bit(fs->index, &rcd->flow_mask);
+ fs->index = RXE_NUM_TID_FLOWS;
+ fs->psn = 0;
+ fs->generation = KERN_GENERATION_RESERVED;
+
+ /* get head before dropping lock */
+ fqp = first_qp(rcd, &rcd->flow_queue);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+ if (fqp == qp) {
+ __trigger_tid_waiter(fqp);
+ rvt_put_qp(fqp);
+ } else {
+ tid_rdma_schedule_tid_wakeup(fqp);
+ }
+}
+
+void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd)
+{
+ int i;
+
+ for (i = 0; i < RXE_NUM_TID_FLOWS; i++) {
+ rcd->flows[i].generation = mask_generation(prandom_u32());
+ kern_set_hw_flow(rcd, KERN_GENERATION_RESERVED, i);
+ }
+}
+
+/* TID allocation functions */
+static u8 trdma_pset_order(struct tid_rdma_pageset *s)
+{
+ u8 count = s->count;
+
+ return ilog2(count) + 1;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_4k - get groups base on mr info
+ * @npages - number of pages
+ * @pages - pointer to an array of page structs
+ * @list - page set array to return
+ *
+ * This routine returns the number of groups associated with
+ * the current sge information. This implementation is based
+ * on the expected receive find_phys_blocks() adjusted to
+ * use the MR information vs. the pfn.
+ *
+ * Return:
+ * the number of RcvArray entries
+ */
+static u32 tid_rdma_find_phys_blocks_4k(struct tid_rdma_flow *flow,
+ struct page **pages,
+ u32 npages,
+ struct tid_rdma_pageset *list)
+{
+ u32 pagecount, pageidx, setcount = 0, i;
+ void *vaddr, *this_vaddr;
+
+ if (!npages)
+ return 0;
+
+ /*
+ * Look for sets of physically contiguous pages in the user buffer.
+ * This will allow us to optimize Expected RcvArray entry usage by
+ * using the bigger supported sizes.
+ */
+ vaddr = page_address(pages[0]);
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, 0, 0, 0, vaddr);
+ for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
+ this_vaddr = i < npages ? page_address(pages[i]) : NULL;
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 0, 0,
+ this_vaddr);
+ /*
+ * If the vaddr's are not sequential, pages are not physically
+ * contiguous.
+ */
+ if (this_vaddr != (vaddr + PAGE_SIZE)) {
+ /*
+ * At this point we have to loop over the set of
+ * physically contiguous pages and break them down it
+ * sizes supported by the HW.
+ * There are two main constraints:
+ * 1. The max buffer size is MAX_EXPECTED_BUFFER.
+ * If the total set size is bigger than that
+ * program only a MAX_EXPECTED_BUFFER chunk.
+ * 2. The buffer size has to be a power of two. If
+ * it is not, round down to the closes power of
+ * 2 and program that size.
+ */
+ while (pagecount) {
+ int maxpages = pagecount;
+ u32 bufsize = pagecount * PAGE_SIZE;
+
+ if (bufsize > MAX_EXPECTED_BUFFER)
+ maxpages =
+ MAX_EXPECTED_BUFFER >>
+ PAGE_SHIFT;
+ else if (!is_power_of_2(bufsize))
+ maxpages =
+ rounddown_pow_of_two(bufsize) >>
+ PAGE_SHIFT;
+
+ list[setcount].idx = pageidx;
+ list[setcount].count = maxpages;
+ trace_hfi1_tid_pageset(flow->req->qp, setcount,
+ list[setcount].idx,
+ list[setcount].count);
+ pagecount -= maxpages;
+ pageidx += maxpages;
+ setcount++;
+ }
+ pageidx = i;
+ pagecount = 1;
+ vaddr = this_vaddr;
+ } else {
+ vaddr += PAGE_SIZE;
+ pagecount++;
+ }
+ }
+ /* insure we always return an even number of sets */
+ if (setcount & 1)
+ list[setcount++].count = 0;
+ return setcount;
+}
+
+/**
+ * tid_flush_pages - dump out pages into pagesets
+ * @list - list of pagesets
+ * @idx - pointer to current page index
+ * @pages - number of pages to dump
+ * @sets - current number of pagesset
+ *
+ * This routine flushes out accumuated pages.
+ *
+ * To insure an even number of sets the
+ * code may add a filler.
+ *
+ * This can happen with when pages is not
+ * a power of 2 or pages is a power of 2
+ * less than the maximum pages.
+ *
+ * Return:
+ * The new number of sets
+ */
+
+static u32 tid_flush_pages(struct tid_rdma_pageset *list,
+ u32 *idx, u32 pages, u32 sets)
+{
+ while (pages) {
+ u32 maxpages = pages;
+
+ if (maxpages > MAX_EXPECTED_PAGES)
+ maxpages = MAX_EXPECTED_PAGES;
+ else if (!is_power_of_2(maxpages))
+ maxpages = rounddown_pow_of_two(maxpages);
+ list[sets].idx = *idx;
+ list[sets++].count = maxpages;
+ *idx += maxpages;
+ pages -= maxpages;
+ }
+ /* might need a filler */
+ if (sets & 1)
+ list[sets++].count = 0;
+ return sets;
+}
+
+/**
+ * tid_rdma_find_phys_blocks_8k - get groups base on mr info
+ * @pages - pointer to an array of page structs
+ * @npages - number of pages
+ * @list - page set array to return
+ *
+ * This routine parses an array of pages to compute pagesets
+ * in an 8k compatible way.
+ *
+ * pages are tested two at a time, i, i + 1 for contiguous
+ * pages and i - 1 and i contiguous pages.
+ *
+ * If any condition is false, any accumlated pages are flushed and
+ * v0,v1 are emitted as separate PAGE_SIZE pagesets
+ *
+ * Otherwise, the current 8k is totaled for a future flush.
+ *
+ * Return:
+ * The number of pagesets
+ * list set with the returned number of pagesets
+ *
+ */
+static u32 tid_rdma_find_phys_blocks_8k(struct tid_rdma_flow *flow,
+ struct page **pages,
+ u32 npages,
+ struct tid_rdma_pageset *list)
+{
+ u32 idx, sets = 0, i;
+ u32 pagecnt = 0;
+ void *v0, *v1, *vm1;
+
+ if (!npages)
+ return 0;
+ for (idx = 0, i = 0, vm1 = NULL; i < npages; i += 2) {
+ /* get a new v0 */
+ v0 = page_address(pages[i]);
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 0, v0);
+ v1 = i + 1 < npages ?
+ page_address(pages[i + 1]) : NULL;
+ trace_hfi1_tid_flow_page(flow->req->qp, flow, i, 1, 1, v1);
+ /* compare i, i + 1 vaddr */
+ if (v1 != (v0 + PAGE_SIZE)) {
+ /* flush out pages */
+ sets = tid_flush_pages(list, &idx, pagecnt, sets);
+ /* output v0,v1 as two pagesets */
+ list[sets].idx = idx++;
+ list[sets++].count = 1;
+ if (v1) {
+ list[sets].count = 1;
+ list[sets++].idx = idx++;
+ } else {
+ list[sets++].count = 0;
+ }
+ vm1 = NULL;
+ pagecnt = 0;
+ continue;
+ }
+ /* i,i+1 consecutive, look at i-1,i */
+ if (vm1 && v0 != (vm1 + PAGE_SIZE)) {
+ /* flush out pages */
+ sets = tid_flush_pages(list, &idx, pagecnt, sets);
+ pagecnt = 0;
+ }
+ /* pages will always be a multiple of 8k */
+ pagecnt += 2;
+ /* save i-1 */
+ vm1 = v1;
+ /* move to next pair */
+ }
+ /* dump residual pages at end */
+ sets = tid_flush_pages(list, &idx, npages - idx, sets);
+ /* by design cannot be odd sets */
+ WARN_ON(sets & 1);
+ return sets;
+}
+
+/**
+ * Find pages for one segment of a sge array represented by @ss. The function
+ * does not check the sge, the sge must have been checked for alignment with a
+ * prior call to hfi1_kern_trdma_ok. Other sge checking is done as part of
+ * rvt_lkey_ok and rvt_rkey_ok. Also, the function only modifies the local sge
+ * copy maintained in @ss->sge, the original sge is not modified.
+ *
+ * Unlike IB RDMA WRITE, we can't decrement ss->num_sge here because we are not
+ * releasing the MR reference count at the same time. Otherwise, we'll "leak"
+ * references to the MR. This difference requires that we keep track of progress
+ * into the sg_list. This is done by the cur_seg cursor in the tid_rdma_request
+ * structure.
+ */
+static u32 kern_find_pages(struct tid_rdma_flow *flow,
+ struct page **pages,
+ struct rvt_sge_state *ss, bool *last)
+{
+ struct tid_rdma_request *req = flow->req;
+ struct rvt_sge *sge = &ss->sge;
+ u32 length = flow->req->seg_len;
+ u32 len = PAGE_SIZE;
+ u32 i = 0;
+
+ while (length && req->isge < ss->num_sge) {
+ pages[i++] = virt_to_page(sge->vaddr);
+
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (!sge->sge_length) {
+ if (++req->isge < ss->num_sge)
+ *sge = ss->sg_list[req->isge - 1];
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= RVT_SEGSZ) {
+ ++sge->m;
+ sge->n = 0;
+ }
+ sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+
+ flow->length = flow->req->seg_len - length;
+ *last = req->isge == ss->num_sge ? false : true;
+ return i;
+}
+
+static void dma_unmap_flow(struct tid_rdma_flow *flow)
+{
+ struct hfi1_devdata *dd;
+ int i;
+ struct tid_rdma_pageset *pset;
+
+ dd = flow->req->rcd->dd;
+ for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
+ i++, pset++) {
+ if (pset->count && pset->addr) {
+ dma_unmap_page(&dd->pcidev->dev,
+ pset->addr,
+ PAGE_SIZE * pset->count,
+ DMA_FROM_DEVICE);
+ pset->mapped = 0;
+ }
+ }
+}
+
+static int dma_map_flow(struct tid_rdma_flow *flow, struct page **pages)
+{
+ int i;
+ struct hfi1_devdata *dd = flow->req->rcd->dd;
+ struct tid_rdma_pageset *pset;
+
+ for (i = 0, pset = &flow->pagesets[0]; i < flow->npagesets;
+ i++, pset++) {
+ if (pset->count) {
+ pset->addr = dma_map_page(&dd->pcidev->dev,
+ pages[pset->idx],
+ 0,
+ PAGE_SIZE * pset->count,
+ DMA_FROM_DEVICE);
+
+ if (dma_mapping_error(&dd->pcidev->dev, pset->addr)) {
+ dma_unmap_flow(flow);
+ return -ENOMEM;
+ }
+ pset->mapped = 1;
+ }
+ }
+ return 0;
+}
+
+static inline bool dma_mapped(struct tid_rdma_flow *flow)
+{
+ return !!flow->pagesets[0].mapped;
+}
+
+/*
+ * Get pages pointers and identify contiguous physical memory chunks for a
+ * segment. All segments are of length flow->req->seg_len.
+ */
+static int kern_get_phys_blocks(struct tid_rdma_flow *flow,
+ struct page **pages,
+ struct rvt_sge_state *ss, bool *last)
+{
+ u8 npages;
+
+ /* Reuse previously computed pagesets, if any */
+ if (flow->npagesets) {
+ trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head,
+ flow);
+ if (!dma_mapped(flow))
+ return dma_map_flow(flow, pages);
+ return 0;
+ }
+
+ npages = kern_find_pages(flow, pages, ss, last);
+
+ if (flow->req->qp->pmtu == enum_to_mtu(OPA_MTU_4096))
+ flow->npagesets =
+ tid_rdma_find_phys_blocks_4k(flow, pages, npages,
+ flow->pagesets);
+ else
+ flow->npagesets =
+ tid_rdma_find_phys_blocks_8k(flow, pages, npages,
+ flow->pagesets);
+
+ return dma_map_flow(flow, pages);
+}
+
+static inline void kern_add_tid_node(struct tid_rdma_flow *flow,
+ struct hfi1_ctxtdata *rcd, char *s,
+ struct tid_group *grp, u8 cnt)
+{
+ struct kern_tid_node *node = &flow->tnode[flow->tnode_cnt++];
+
+ WARN_ON_ONCE(flow->tnode_cnt >=
+ (TID_RDMA_MAX_SEGMENT_SIZE >> PAGE_SHIFT));
+ if (WARN_ON_ONCE(cnt & 1))
+ dd_dev_err(rcd->dd,
+ "unexpected odd allocation cnt %u map 0x%x used %u",
+ cnt, grp->map, grp->used);
+
+ node->grp = grp;
+ node->map = grp->map;
+ node->cnt = cnt;
+ trace_hfi1_tid_node_add(flow->req->qp, s, flow->tnode_cnt - 1,
+ grp->base, grp->map, grp->used, cnt);
+}
+
+/*
+ * Try to allocate pageset_count TID's from TID groups for a context
+ *
+ * This function allocates TID's without moving groups between lists or
+ * modifying grp->map. This is done as follows, being cogizant of the lists
+ * between which the TID groups will move:
+ * 1. First allocate complete groups of 8 TID's since this is more efficient,
+ * these groups will move from group->full without affecting used
+ * 2. If more TID's are needed allocate from used (will move from used->full or
+ * stay in used)
+ * 3. If we still don't have the required number of TID's go back and look again
+ * at a complete group (will move from group->used)
+ */
+static int kern_alloc_tids(struct tid_rdma_flow *flow)
+{
+ struct hfi1_ctxtdata *rcd = flow->req->rcd;
+ struct hfi1_devdata *dd = rcd->dd;
+ u32 ngroups, pageidx = 0;
+ struct tid_group *group = NULL, *used;
+ u8 use;
+
+ flow->tnode_cnt = 0;
+ ngroups = flow->npagesets / dd->rcv_entries.group_size;
+ if (!ngroups)
+ goto used_list;
+
+ /* First look at complete groups */
+ list_for_each_entry(group, &rcd->tid_group_list.list, list) {
+ kern_add_tid_node(flow, rcd, "complete groups", group,
+ group->size);
+
+ pageidx += group->size;
+ if (!--ngroups)
+ break;
+ }
+
+ if (pageidx >= flow->npagesets)
+ goto ok;
+
+used_list:
+ /* Now look at partially used groups */
+ list_for_each_entry(used, &rcd->tid_used_list.list, list) {
+ use = min_t(u32, flow->npagesets - pageidx,
+ used->size - used->used);
+ kern_add_tid_node(flow, rcd, "used groups", used, use);
+
+ pageidx += use;
+ if (pageidx >= flow->npagesets)
+ goto ok;
+ }
+
+ /*
+ * Look again at a complete group, continuing from where we left.
+ * However, if we are at the head, we have reached the end of the
+ * complete groups list from the first loop above
+ */
+ if (group && &group->list == &rcd->tid_group_list.list)
+ goto bail_eagain;
+ group = list_prepare_entry(group, &rcd->tid_group_list.list,
+ list);
+ if (list_is_last(&group->list, &rcd->tid_group_list.list))
+ goto bail_eagain;
+ group = list_next_entry(group, list);
+ use = min_t(u32, flow->npagesets - pageidx, group->size);
+ kern_add_tid_node(flow, rcd, "complete continue", group, use);
+ pageidx += use;
+ if (pageidx >= flow->npagesets)
+ goto ok;
+bail_eagain:
+ trace_hfi1_msg_alloc_tids(flow->req->qp, " insufficient tids: needed ",
+ (u64)flow->npagesets);
+ return -EAGAIN;
+ok:
+ return 0;
+}
+
+static void kern_program_rcv_group(struct tid_rdma_flow *flow, int grp_num,
+ u32 *pset_idx)
+{
+ struct hfi1_ctxtdata *rcd = flow->req->rcd;
+ struct hfi1_devdata *dd = rcd->dd;
+ struct kern_tid_node *node = &flow->tnode[grp_num];
+ struct tid_group *grp = node->grp;
+ struct tid_rdma_pageset *pset;
+ u32 pmtu_pg = flow->req->qp->pmtu >> PAGE_SHIFT;
+ u32 rcventry, npages = 0, pair = 0, tidctrl;
+ u8 i, cnt = 0;
+
+ for (i = 0; i < grp->size; i++) {
+ rcventry = grp->base + i;
+
+ if (node->map & BIT(i) || cnt >= node->cnt) {
+ rcv_array_wc_fill(dd, rcventry);
+ continue;
+ }
+ pset = &flow->pagesets[(*pset_idx)++];
+ if (pset->count) {
+ hfi1_put_tid(dd, rcventry, PT_EXPECTED,
+ pset->addr, trdma_pset_order(pset));
+ } else {
+ hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
+ }
+ npages += pset->count;
+
+ rcventry -= rcd->expected_base;
+ tidctrl = pair ? 0x3 : rcventry & 0x1 ? 0x2 : 0x1;
+ /*
+ * A single TID entry will be used to use a rcvarr pair (with
+ * tidctrl 0x3), if ALL these are true (a) the bit pos is even
+ * (b) the group map shows current and the next bits as free
+ * indicating two consecutive rcvarry entries are available (c)
+ * we actually need 2 more entries
+ */
+ pair = !(i & 0x1) && !((node->map >> i) & 0x3) &&
+ node->cnt >= cnt + 2;
+ if (!pair) {
+ if (!pset->count)
+ tidctrl = 0x1;
+ flow->tid_entry[flow->tidcnt++] =
+ EXP_TID_SET(IDX, rcventry >> 1) |
+ EXP_TID_SET(CTRL, tidctrl) |
+ EXP_TID_SET(LEN, npages);
+ trace_hfi1_tid_entry_alloc(/* entry */
+ flow->req->qp, flow->tidcnt - 1,
+ flow->tid_entry[flow->tidcnt - 1]);
+
+ /* Efficient DIV_ROUND_UP(npages, pmtu_pg) */
+ flow->npkts += (npages + pmtu_pg - 1) >> ilog2(pmtu_pg);
+ npages = 0;
+ }
+
+ if (grp->used == grp->size - 1)
+ tid_group_move(grp, &rcd->tid_used_list,
+ &rcd->tid_full_list);
+ else if (!grp->used)
+ tid_group_move(grp, &rcd->tid_group_list,
+ &rcd->tid_used_list);
+
+ grp->used++;
+ grp->map |= BIT(i);
+ cnt++;
+ }
+}
+
+static void kern_unprogram_rcv_group(struct tid_rdma_flow *flow, int grp_num)
+{
+ struct hfi1_ctxtdata *rcd = flow->req->rcd;
+ struct hfi1_devdata *dd = rcd->dd;
+ struct kern_tid_node *node = &flow->tnode[grp_num];
+ struct tid_group *grp = node->grp;
+ u32 rcventry;
+ u8 i, cnt = 0;
+
+ for (i = 0; i < grp->size; i++) {
+ rcventry = grp->base + i;
+
+ if (node->map & BIT(i) || cnt >= node->cnt) {
+ rcv_array_wc_fill(dd, rcventry);
+ continue;
+ }
+
+ hfi1_put_tid(dd, rcventry, PT_INVALID, 0, 0);
+
+ grp->used--;
+ grp->map &= ~BIT(i);
+ cnt++;
+
+ if (grp->used == grp->size - 1)
+ tid_group_move(grp, &rcd->tid_full_list,
+ &rcd->tid_used_list);
+ else if (!grp->used)
+ tid_group_move(grp, &rcd->tid_used_list,
+ &rcd->tid_group_list);
+ }
+ if (WARN_ON_ONCE(cnt & 1)) {
+ struct hfi1_ctxtdata *rcd = flow->req->rcd;
+ struct hfi1_devdata *dd = rcd->dd;
+
+ dd_dev_err(dd, "unexpected odd free cnt %u map 0x%x used %u",
+ cnt, grp->map, grp->used);
+ }
+}
+
+static void kern_program_rcvarray(struct tid_rdma_flow *flow)
+{
+ u32 pset_idx = 0;
+ int i;
+
+ flow->npkts = 0;
+ flow->tidcnt = 0;
+ for (i = 0; i < flow->tnode_cnt; i++)
+ kern_program_rcv_group(flow, i, &pset_idx);
+ trace_hfi1_tid_flow_alloc(flow->req->qp, flow->req->setup_head, flow);
+}
+
+/**
+ * hfi1_kern_exp_rcv_setup() - setup TID's and flow for one segment of a
+ * TID RDMA request
+ *
+ * @req: TID RDMA request for which the segment/flow is being set up
+ * @ss: sge state, maintains state across successive segments of a sge
+ * @last: set to true after the last sge segment has been processed
+ *
+ * This function
+ * (1) finds a free flow entry in the flow circular buffer
+ * (2) finds pages and continuous physical chunks constituing one segment
+ * of an sge
+ * (3) allocates TID group entries for those chunks
+ * (4) programs rcvarray entries in the hardware corresponding to those
+ * TID's
+ * (5) computes a tidarray with formatted TID entries which can be sent
+ * to the sender
+ * (6) Reserves and programs HW flows.
+ * (7) It also manages queing the QP when TID/flow resources are not
+ * available.
+ *
+ * @req points to struct tid_rdma_request of which the segments are a part. The
+ * function uses qp, rcd and seg_len members of @req. In the absence of errors,
+ * req->flow_idx is the index of the flow which has been prepared in this
+ * invocation of function call. With flow = &req->flows[req->flow_idx],
+ * flow->tid_entry contains the TID array which the sender can use for TID RDMA
+ * sends and flow->npkts contains number of packets required to send the
+ * segment.
+ *
+ * hfi1_check_sge_align should be called prior to calling this function and if
+ * it signals error TID RDMA cannot be used for this sge and this function
+ * should not be called.
+ *
+ * For the queuing, caller must hold the flow->req->qp s_lock from the send
+ * engine and the function will procure the exp_lock.
+ *
+ * Return:
+ * The function returns -EAGAIN if sufficient number of TID/flow resources to
+ * map the segment could not be allocated. In this case the function should be
+ * called again with previous arguments to retry the TID allocation. There are
+ * no other error returns. The function returns 0 on success.
+ */
+int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
+ struct rvt_sge_state *ss, bool *last)
+ __must_hold(&req->qp->s_lock)
+{
+ struct tid_rdma_flow *flow = &req->flows[req->setup_head];
+ struct hfi1_ctxtdata *rcd = req->rcd;
+ struct hfi1_qp_priv *qpriv = req->qp->priv;
+ unsigned long flags;
+ struct rvt_qp *fqp;
+ u16 clear_tail = req->clear_tail;
+
+ lockdep_assert_held(&req->qp->s_lock);
+ /*
+ * We return error if either (a) we don't have space in the flow
+ * circular buffer, or (b) we already have max entries in the buffer.
+ * Max entries depend on the type of request we are processing and the
+ * negotiated TID RDMA parameters.
+ */
+ if (!CIRC_SPACE(req->setup_head, clear_tail, MAX_FLOWS) ||
+ CIRC_CNT(req->setup_head, clear_tail, MAX_FLOWS) >=
+ req->n_flows)
+ return -EINVAL;
+
+ /*
+ * Get pages, identify contiguous physical memory chunks for the segment
+ * If we can not determine a DMA address mapping we will treat it just
+ * like if we ran out of space above.
+ */
+ if (kern_get_phys_blocks(flow, qpriv->pages, ss, last)) {
+ hfi1_wait_kmem(flow->req->qp);
+ return -ENOMEM;
+ }
+
+ spin_lock_irqsave(&rcd->exp_lock, flags);
+ if (kernel_tid_waiters(rcd, &rcd->rarr_queue, flow->req->qp))
+ goto queue;
+
+ /*
+ * At this point we know the number of pagesets and hence the number of
+ * TID's to map the segment. Allocate the TID's from the TID groups. If
+ * we cannot allocate the required number we exit and try again later
+ */
+ if (kern_alloc_tids(flow))
+ goto queue;
+ /*
+ * Finally program the TID entries with the pagesets, compute the
+ * tidarray and enable the HW flow
+ */
+ kern_program_rcvarray(flow);
+
+ /*
+ * Setup the flow state with relevant information.
+ * This information is used for tracking the sequence of data packets
+ * for the segment.
+ * The flow is setup here as this is the most accurate time and place
+ * to do so. Doing at a later time runs the risk of the flow data in
+ * qpriv getting out of sync.
+ */
+ memset(&flow->flow_state, 0x0, sizeof(flow->flow_state));
+ flow->idx = qpriv->flow_state.index;
+ flow->flow_state.generation = qpriv->flow_state.generation;
+ flow->flow_state.spsn = qpriv->flow_state.psn;
+ flow->flow_state.lpsn = flow->flow_state.spsn + flow->npkts - 1;
+ flow->flow_state.r_next_psn =
+ full_flow_psn(flow, flow->flow_state.spsn);
+ qpriv->flow_state.psn += flow->npkts;
+
+ dequeue_tid_waiter(rcd, &rcd->rarr_queue, flow->req->qp);
+ /* get head before dropping lock */
+ fqp = first_qp(rcd, &rcd->rarr_queue);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+ tid_rdma_schedule_tid_wakeup(fqp);
+
+ req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
+ return 0;
+queue:
+ queue_qp_for_tid_wait(rcd, &rcd->rarr_queue, flow->req->qp);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+ return -EAGAIN;
+}
+
+static void hfi1_tid_rdma_reset_flow(struct tid_rdma_flow *flow)
+{
+ flow->npagesets = 0;
+}
+
+/*
+ * This function is called after one segment has been successfully sent to
+ * release the flow and TID HW/SW resources for that segment. The segments for a
+ * TID RDMA request are setup and cleared in FIFO order which is managed using a
+ * circular buffer.
+ */
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req)
+ __must_hold(&req->qp->s_lock)
+{
+ struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+ struct hfi1_ctxtdata *rcd = req->rcd;
+ unsigned long flags;
+ int i;
+ struct rvt_qp *fqp;
+
+ lockdep_assert_held(&req->qp->s_lock);
+ /* Exit if we have nothing in the flow circular buffer */
+ if (!CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS))
+ return -EINVAL;
+
+ spin_lock_irqsave(&rcd->exp_lock, flags);
+
+ for (i = 0; i < flow->tnode_cnt; i++)
+ kern_unprogram_rcv_group(flow, i);
+ /* To prevent double unprogramming */
+ flow->tnode_cnt = 0;
+ /* get head before dropping lock */
+ fqp = first_qp(rcd, &rcd->rarr_queue);
+ spin_unlock_irqrestore(&rcd->exp_lock, flags);
+
+ dma_unmap_flow(flow);
+
+ hfi1_tid_rdma_reset_flow(flow);
+ req->clear_tail = (req->clear_tail + 1) & (MAX_FLOWS - 1);
+
+ if (fqp == req->qp) {
+ __trigger_tid_waiter(fqp);
+ rvt_put_qp(fqp);
+ } else {
+ tid_rdma_schedule_tid_wakeup(fqp);
+ }
+
+ return 0;
+}
+
+/*
+ * This function is called to release all the tid entries for
+ * a request.
+ */
+void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req)
+ __must_hold(&req->qp->s_lock)
+{
+ /* Use memory barrier for proper ordering */
+ while (CIRC_CNT(req->setup_head, req->clear_tail, MAX_FLOWS)) {
+ if (hfi1_kern_exp_rcv_clear(req))
+ break;
+ }
+}
+
+/**
+ * hfi1_kern_exp_rcv_free_flows - free priviously allocated flow information
+ * @req - the tid rdma request to be cleaned
+ */
+static void hfi1_kern_exp_rcv_free_flows(struct tid_rdma_request *req)
+{
+ kfree(req->flows);
+ req->flows = NULL;
+}
+
+/**
+ * __trdma_clean_swqe - clean up for large sized QPs
+ * @qp: the queue patch
+ * @wqe: the send wqe
+ */
+void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct hfi1_swqe_priv *p = wqe->priv;
+
+ hfi1_kern_exp_rcv_free_flows(&p->tid_req);
+}
+
+/*
+ * This can be called at QP create time or in the data path.
+ */
+static int hfi1_kern_exp_rcv_alloc_flows(struct tid_rdma_request *req,
+ gfp_t gfp)
+{
+ struct tid_rdma_flow *flows;
+ int i;
+
+ if (likely(req->flows))
+ return 0;
+ flows = kmalloc_node(MAX_FLOWS * sizeof(*flows), gfp,
+ req->rcd->numa_id);
+ if (!flows)
+ return -ENOMEM;
+ /* mini init */
+ for (i = 0; i < MAX_FLOWS; i++) {
+ flows[i].req = req;
+ flows[i].npagesets = 0;
+ flows[i].pagesets[0].mapped = 0;
+ flows[i].resync_npkts = 0;
+ }
+ req->flows = flows;
+ return 0;
+}
+
+static void hfi1_init_trdma_req(struct rvt_qp *qp,
+ struct tid_rdma_request *req)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ /*
+ * Initialize various TID RDMA request variables.
+ * These variables are "static", which is why they
+ * can be pre-initialized here before the WRs has
+ * even been submitted.
+ * However, non-NULL values for these variables do not
+ * imply that this WQE has been enabled for TID RDMA.
+ * Drivers should check the WQE's opcode to determine
+ * if a request is a TID RDMA one or not.
+ */
+ req->qp = qp;
+ req->rcd = qpriv->rcd;
+}
+
+u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data)
+{
+ struct hfi1_devdata *dd = context;
+
+ return dd->verbs_dev.n_tidwait;
+}
+
+static struct tid_rdma_flow *find_flow_ib(struct tid_rdma_request *req,
+ u32 psn, u16 *fidx)
+{
+ u16 head, tail;
+ struct tid_rdma_flow *flow;
+
+ head = req->setup_head;
+ tail = req->clear_tail;
+ for ( ; CIRC_CNT(head, tail, MAX_FLOWS);
+ tail = CIRC_NEXT(tail, MAX_FLOWS)) {
+ flow = &req->flows[tail];
+ if (cmp_psn(psn, flow->flow_state.ib_spsn) >= 0 &&
+ cmp_psn(psn, flow->flow_state.ib_lpsn) <= 0) {
+ if (fidx)
+ *fidx = tail;
+ return flow;
+ }
+ }
+ return NULL;
+}
+
+/* TID RDMA READ functions */
+u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u32 *len)
+{
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = &req->flows[req->flow_idx];
+ struct rvt_qp *qp = req->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_swqe_priv *wpriv = wqe->priv;
+ struct tid_rdma_read_req *rreq = &ohdr->u.tid_rdma.r_req;
+ struct tid_rdma_params *remote;
+ u32 req_len = 0;
+ void *req_addr = NULL;
+
+ /* This is the IB psn used to send the request */
+ *bth2 = mask_psn(flow->flow_state.ib_spsn + flow->pkt);
+ trace_hfi1_tid_flow_build_read_pkt(qp, req->flow_idx, flow);
+
+ /* TID Entries for TID RDMA READ payload */
+ req_addr = &flow->tid_entry[flow->tid_idx];
+ req_len = sizeof(*flow->tid_entry) *
+ (flow->tidcnt - flow->tid_idx);
+
+ memset(&ohdr->u.tid_rdma.r_req, 0, sizeof(ohdr->u.tid_rdma.r_req));
+ wpriv->ss.sge.vaddr = req_addr;
+ wpriv->ss.sge.sge_length = req_len;
+ wpriv->ss.sge.length = wpriv->ss.sge.sge_length;
+ /*
+ * We can safely zero these out. Since the first SGE covers the
+ * entire packet, nothing else should even look at the MR.
+ */
+ wpriv->ss.sge.mr = NULL;
+ wpriv->ss.sge.m = 0;
+ wpriv->ss.sge.n = 0;
+
+ wpriv->ss.sg_list = NULL;
+ wpriv->ss.total_len = wpriv->ss.sge.sge_length;
+ wpriv->ss.num_sge = 1;
+
+ /* Construct the TID RDMA READ REQ packet header */
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+
+ KDETH_RESET(rreq->kdeth0, KVER, 0x1);
+ KDETH_RESET(rreq->kdeth1, JKEY, remote->jkey);
+ rreq->reth.vaddr = cpu_to_be64(wqe->rdma_wr.remote_addr +
+ req->cur_seg * req->seg_len + flow->sent);
+ rreq->reth.rkey = cpu_to_be32(wqe->rdma_wr.rkey);
+ rreq->reth.length = cpu_to_be32(*len);
+ rreq->tid_flow_psn =
+ cpu_to_be32((flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT) |
+ ((flow->flow_state.spsn + flow->pkt) &
+ HFI1_KDETH_BTH_SEQ_MASK));
+ rreq->tid_flow_qp =
+ cpu_to_be32(qpriv->tid_rdma.local.qp |
+ ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+ TID_RDMA_DESTQP_FLOW_SHIFT) |
+ qpriv->rcd->ctxt);
+ rreq->verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 &= ~RVT_QPN_MASK;
+ *bth1 |= remote->qp;
+ *bth2 |= IB_BTH_REQ_ACK;
+ rcu_read_unlock();
+
+ /* We are done with this segment */
+ flow->sent += *len;
+ req->cur_seg++;
+ qp->s_state = TID_OP(READ_REQ);
+ req->ack_pending++;
+ req->flow_idx = (req->flow_idx + 1) & (MAX_FLOWS - 1);
+ qpriv->pending_tid_r_segs++;
+ qp->s_num_rd_atomic++;
+
+ /* Set the TID RDMA READ request payload size */
+ *len = req_len;
+
+ return sizeof(ohdr->u.tid_rdma.r_req) / sizeof(u32);
+}
+
+/*
+ * @len: contains the data length to read upon entry and the read request
+ * payload length upon exit.
+ */
+u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u32 *len)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = NULL;
+ u32 hdwords = 0;
+ bool last;
+ bool retry = true;
+ u32 npkts = rvt_div_round_up_mtu(qp, *len);
+
+ trace_hfi1_tid_req_build_read_req(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ /*
+ * Check sync conditions. Make sure that there are no pending
+ * segments before freeing the flow.
+ */
+sync_check:
+ if (req->state == TID_REQUEST_SYNC) {
+ if (qpriv->pending_tid_r_segs)
+ goto done;
+
+ hfi1_kern_clear_hw_flow(req->rcd, qp);
+ qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ /*
+ * If the request for this segment is resent, the tid resources should
+ * have been allocated before. In this case, req->flow_idx should
+ * fall behind req->setup_head.
+ */
+ if (req->flow_idx == req->setup_head) {
+ retry = false;
+ if (req->state == TID_REQUEST_RESEND) {
+ /*
+ * This is the first new segment for a request whose
+ * earlier segments have been re-sent. We need to
+ * set up the sge pointer correctly.
+ */
+ restart_sge(&qp->s_sge, wqe, req->s_next_psn,
+ qp->pmtu);
+ req->isge = 0;
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ /*
+ * Check sync. The last PSN of each generation is reserved for
+ * RESYNC.
+ */
+ if ((qpriv->flow_state.psn + npkts) > MAX_TID_FLOW_PSN - 1) {
+ req->state = TID_REQUEST_SYNC;
+ goto sync_check;
+ }
+
+ /* Allocate the flow if not yet */
+ if (hfi1_kern_setup_hw_flow(qpriv->rcd, qp))
+ goto done;
+
+ /*
+ * The following call will advance req->setup_head after
+ * allocating the tid entries.
+ */
+ if (hfi1_kern_exp_rcv_setup(req, &qp->s_sge, &last)) {
+ req->state = TID_REQUEST_QUEUED;
+
+ /*
+ * We don't have resources for this segment. The QP has
+ * already been queued.
+ */
+ goto done;
+ }
+ }
+
+ /* req->flow_idx should only be one slot behind req->setup_head */
+ flow = &req->flows[req->flow_idx];
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->sent = 0;
+ if (!retry) {
+ /* Set the first and last IB PSN for the flow in use.*/
+ flow->flow_state.ib_spsn = req->s_next_psn;
+ flow->flow_state.ib_lpsn =
+ flow->flow_state.ib_spsn + flow->npkts - 1;
+ }
+
+ /* Calculate the next segment start psn.*/
+ req->s_next_psn += flow->npkts;
+
+ /* Build the packet header */
+ hdwords = hfi1_build_tid_rdma_read_packet(wqe, ohdr, bth1, bth2, len);
+done:
+ return hdwords;
+}
+
+/*
+ * Validate and accept the TID RDMA READ request parameters.
+ * Return 0 if the request is accepted successfully;
+ * Return 1 otherwise.
+ */
+static int tid_rdma_rcv_read_request(struct rvt_qp *qp,
+ struct rvt_ack_entry *e,
+ struct hfi1_packet *packet,
+ struct ib_other_headers *ohdr,
+ u32 bth0, u32 psn, u64 vaddr, u32 len)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 flow_psn, i, tidlen = 0, pktlen, tlen;
+
+ req = ack_to_tid_req(e);
+
+ /* Validate the payload first */
+ flow = &req->flows[req->setup_head];
+
+ /* payload length = packet length - (header length + ICRC length) */
+ pktlen = packet->tlen - (packet->hlen + 4);
+ if (pktlen > sizeof(flow->tid_entry))
+ return 1;
+ memcpy(flow->tid_entry, packet->ebuf, pktlen);
+ flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
+
+ /*
+ * Walk the TID_ENTRY list to make sure we have enough space for a
+ * complete segment. Also calculate the number of required packets.
+ */
+ flow->npkts = rvt_div_round_up_mtu(qp, len);
+ for (i = 0; i < flow->tidcnt; i++) {
+ trace_hfi1_tid_entry_rcv_read_req(qp, i,
+ flow->tid_entry[i]);
+ tlen = EXP_TID_GET(flow->tid_entry[i], LEN);
+ if (!tlen)
+ return 1;
+
+ /*
+ * For tid pair (tidctr == 3), the buffer size of the pair
+ * should be the sum of the buffer size described by each
+ * tid entry. However, only the first entry needs to be
+ * specified in the request (see WFR HAS Section 8.5.7.1).
+ */
+ tidlen += tlen;
+ }
+ if (tidlen * PAGE_SIZE < len)
+ return 1;
+
+ /* Empty the flow array */
+ req->clear_tail = req->setup_head;
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->tid_offset = 0;
+ flow->sent = 0;
+ flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_qp);
+ flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
+ TID_RDMA_DESTQP_FLOW_MASK;
+ flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_req.tid_flow_psn));
+ flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
+ flow->length = len;
+
+ flow->flow_state.lpsn = flow->flow_state.spsn +
+ flow->npkts - 1;
+ flow->flow_state.ib_spsn = psn;
+ flow->flow_state.ib_lpsn = flow->flow_state.ib_spsn + flow->npkts - 1;
+
+ trace_hfi1_tid_flow_rcv_read_req(qp, req->setup_head, flow);
+ /* Set the initial flow index to the current flow. */
+ req->flow_idx = req->setup_head;
+
+ /* advance circular buffer head */
+ req->setup_head = (req->setup_head + 1) & (MAX_FLOWS - 1);
+
+ /*
+ * Compute last PSN for request.
+ */
+ e->opcode = (bth0 >> 24) & 0xff;
+ e->psn = psn;
+ e->lpsn = psn + flow->npkts - 1;
+ e->sent = 0;
+
+ req->n_flows = qpriv->tid_rdma.local.max_read;
+ req->state = TID_REQUEST_ACTIVE;
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_seg = 0;
+ req->isge = 0;
+ req->seg_len = qpriv->tid_rdma.local.max_len;
+ req->total_len = len;
+ req->total_segs = 1;
+ req->r_flow_psn = e->psn;
+
+ trace_hfi1_tid_req_rcv_read_req(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ return 0;
+}
+
+static int tid_rdma_rcv_error(struct hfi1_packet *packet,
+ struct ib_other_headers *ohdr,
+ struct rvt_qp *qp, u32 psn, int diff)
+{
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_ctxtdata *rcd = ((struct hfi1_qp_priv *)qp->priv)->rcd;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ unsigned long flags;
+ u8 prev;
+ bool old_req;
+
+ trace_hfi1_rsp_tid_rcv_error(qp, psn);
+ trace_hfi1_tid_rdma_rcv_err(qp, 0, psn, diff);
+ if (diff > 0) {
+ /* sequence error */
+ if (!qp->r_nak_state) {
+ ibp->rvp.n_rc_seqnak++;
+ qp->r_nak_state = IB_NAK_PSN_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+ rc_defered_ack(rcd, qp);
+ }
+ goto done;
+ }
+
+ ibp->rvp.n_rc_dupreq++;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ e = find_prev_entry(qp, psn, &prev, NULL, &old_req);
+ if (!e || (e->opcode != TID_OP(READ_REQ) &&
+ e->opcode != TID_OP(WRITE_REQ)))
+ goto unlock;
+
+ req = ack_to_tid_req(e);
+ req->r_flow_psn = psn;
+ trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn, e->lpsn, req);
+ if (e->opcode == TID_OP(READ_REQ)) {
+ struct ib_reth *reth;
+ u32 len;
+ u32 rkey;
+ u64 vaddr;
+ int ok;
+ u32 bth0;
+
+ reth = &ohdr->u.tid_rdma.r_req.reth;
+ /*
+ * The requester always restarts from the start of the original
+ * request.
+ */
+ len = be32_to_cpu(reth->length);
+ if (psn != e->psn || len != req->total_len)
+ goto unlock;
+
+ release_rdma_sge_mr(e);
+
+ rkey = be32_to_cpu(reth->rkey);
+ vaddr = get_ib_reth_vaddr(reth);
+
+ qp->r_len = len;
+ ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+ IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto unlock;
+
+ /*
+ * If all the response packets for the current request have
+ * been sent out and this request is complete (old_request
+ * == false) and the TID flow may be unusable (the
+ * req->clear_tail is advanced). However, when an earlier
+ * request is received, this request will not be complete any
+ * more (qp->s_tail_ack_queue is moved back, see below).
+ * Consequently, we need to update the TID flow info everytime
+ * a duplicate request is received.
+ */
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn,
+ vaddr, len))
+ goto unlock;
+
+ /*
+ * True if the request is already scheduled (between
+ * qp->s_tail_ack_queue and qp->r_head_ack_queue);
+ */
+ if (old_req)
+ goto unlock;
+ } else {
+ struct flow_state *fstate;
+ bool schedule = false;
+ u8 i;
+
+ if (req->state == TID_REQUEST_RESEND) {
+ req->state = TID_REQUEST_RESEND_ACTIVE;
+ } else if (req->state == TID_REQUEST_INIT_RESEND) {
+ req->state = TID_REQUEST_INIT;
+ schedule = true;
+ }
+
+ /*
+ * True if the request is already scheduled (between
+ * qp->s_tail_ack_queue and qp->r_head_ack_queue).
+ * Also, don't change requests, which are at the SYNC
+ * point and haven't generated any responses yet.
+ * There is nothing to retransmit for them yet.
+ */
+ if (old_req || req->state == TID_REQUEST_INIT ||
+ (req->state == TID_REQUEST_SYNC && !req->cur_seg)) {
+ for (i = prev + 1; ; i++) {
+ if (i > rvt_size_atomic(&dev->rdi))
+ i = 0;
+ if (i == qp->r_head_ack_queue)
+ break;
+ e = &qp->s_ack_queue[i];
+ req = ack_to_tid_req(e);
+ if (e->opcode == TID_OP(WRITE_REQ) &&
+ req->state == TID_REQUEST_INIT)
+ req->state = TID_REQUEST_INIT_RESEND;
+ }
+ /*
+ * If the state of the request has been changed,
+ * the first leg needs to get scheduled in order to
+ * pick up the change. Otherwise, normal response
+ * processing should take care of it.
+ */
+ if (!schedule)
+ goto unlock;
+ }
+
+ /*
+ * If there is no more allocated segment, just schedule the qp
+ * without changing any state.
+ */
+ if (req->clear_tail == req->setup_head)
+ goto schedule;
+ /*
+ * If this request has sent responses for segments, which have
+ * not received data yet (flow_idx != clear_tail), the flow_idx
+ * pointer needs to be adjusted so the same responses can be
+ * re-sent.
+ */
+ if (CIRC_CNT(req->flow_idx, req->clear_tail, MAX_FLOWS)) {
+ fstate = &req->flows[req->clear_tail].flow_state;
+ qpriv->pending_tid_w_segs -=
+ CIRC_CNT(req->flow_idx, req->clear_tail,
+ MAX_FLOWS);
+ req->flow_idx =
+ CIRC_ADD(req->clear_tail,
+ delta_psn(psn, fstate->resp_ib_psn),
+ MAX_FLOWS);
+ qpriv->pending_tid_w_segs +=
+ delta_psn(psn, fstate->resp_ib_psn);
+ /*
+ * When flow_idx == setup_head, we've gotten a duplicate
+ * request for a segment, which has not been allocated
+ * yet. In that case, don't adjust this request.
+ * However, we still want to go through the loop below
+ * to adjust all subsequent requests.
+ */
+ if (CIRC_CNT(req->setup_head, req->flow_idx,
+ MAX_FLOWS)) {
+ req->cur_seg = delta_psn(psn, e->psn);
+ req->state = TID_REQUEST_RESEND_ACTIVE;
+ }
+ }
+
+ for (i = prev + 1; ; i++) {
+ /*
+ * Look at everything up to and including
+ * s_tail_ack_queue
+ */
+ if (i > rvt_size_atomic(&dev->rdi))
+ i = 0;
+ if (i == qp->r_head_ack_queue)
+ break;
+ e = &qp->s_ack_queue[i];
+ req = ack_to_tid_req(e);
+ trace_hfi1_tid_req_rcv_err(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ if (e->opcode != TID_OP(WRITE_REQ) ||
+ req->cur_seg == req->comp_seg ||
+ req->state == TID_REQUEST_INIT ||
+ req->state == TID_REQUEST_INIT_RESEND) {
+ if (req->state == TID_REQUEST_INIT)
+ req->state = TID_REQUEST_INIT_RESEND;
+ continue;
+ }
+ qpriv->pending_tid_w_segs -=
+ CIRC_CNT(req->flow_idx,
+ req->clear_tail,
+ MAX_FLOWS);
+ req->flow_idx = req->clear_tail;
+ req->state = TID_REQUEST_RESEND;
+ req->cur_seg = req->comp_seg;
+ }
+ qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
+ }
+ /* Re-process old requests.*/
+ if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
+ qp->s_acked_ack_queue = prev;
+ qp->s_tail_ack_queue = prev;
+ /*
+ * Since the qp->s_tail_ack_queue is modified, the
+ * qp->s_ack_state must be changed to re-initialize
+ * qp->s_ack_rdma_sge; Otherwise, we will end up in
+ * wrong memory region.
+ */
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+schedule:
+ /*
+ * It's possible to receive a retry psn that is earlier than an RNRNAK
+ * psn. In this case, the rnrnak state should be cleared.
+ */
+ if (qpriv->rnr_nak_state) {
+ qp->s_nak_state = 0;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qp->r_psn = e->lpsn + 1;
+ hfi1_tid_write_alloc_resources(qp, true);
+ }
+
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return 1;
+}
+
+void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
+{
+ /* HANDLER FOR TID RDMA READ REQUEST packet (Responder side)*/
+
+ /*
+ * 1. Verify TID RDMA READ REQ as per IB_OPCODE_RC_RDMA_READ
+ * (see hfi1_rc_rcv())
+ * 2. Put TID RDMA READ REQ into the response queueu (s_ack_queue)
+ * - Setup struct tid_rdma_req with request info
+ * - Initialize struct tid_rdma_flow info;
+ * - Copy TID entries;
+ * 3. Set the qp->s_ack_state.
+ * 4. Set RVT_S_RESP_PENDING in s_flags.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ */
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ unsigned long flags;
+ struct ib_reth *reth;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u32 bth0, psn, len, rkey;
+ bool fecn;
+ u8 next;
+ u64 vaddr;
+ int diff;
+ u8 nack_state = IB_NAK_INVALID_REQUEST;
+
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, packet))
+ return;
+
+ fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ trace_hfi1_rsp_rcv_tid_read_req(qp, psn);
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+ rvt_comm_est(qp);
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto nack_inv;
+
+ reth = &ohdr->u.tid_rdma.r_req.reth;
+ vaddr = be64_to_cpu(reth->vaddr);
+ len = be32_to_cpu(reth->length);
+ /* The length needs to be in multiples of PAGE_SIZE */
+ if (!len || len & ~PAGE_MASK || len > qpriv->tid_rdma.local.max_len)
+ goto nack_inv;
+
+ diff = delta_psn(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
+ return;
+ }
+
+ /* We've verified the request, insert it into the ack queue. */
+ next = qp->r_head_ack_queue + 1;
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent) {
+ nack_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+ goto nack_inv_unlock;
+ }
+ update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ release_rdma_sge_mr(e);
+
+ rkey = be32_to_cpu(reth->rkey);
+ qp->r_len = len;
+
+ if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
+ rkey, IB_ACCESS_REMOTE_READ)))
+ goto nack_acc;
+
+ /* Accept the request parameters */
+ if (tid_rdma_rcv_read_request(qp, e, packet, ohdr, bth0, psn, vaddr,
+ len))
+ goto nack_inv_unlock;
+
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn += e->lpsn - e->psn + 1;
+
+ qp->r_head_ack_queue = next;
+
+ /*
+ * For all requests other than TID WRITE which are added to the ack
+ * queue, qpriv->r_tid_alloc follows qp->r_head_ack_queue. It is ok to
+ * do this because of interlocks between these and TID WRITE
+ * requests. The same change has also been made in hfi1_rc_rcv().
+ */
+ qpriv->r_tid_alloc = qp->r_head_ack_queue;
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ if (fecn)
+ qp->s_flags |= RVT_S_ECN;
+ hfi1_schedule_send(qp);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return;
+
+nack_inv_unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = nack_state;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ rc_defered_ack(rcd, qp);
+ return;
+nack_acc:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+}
+
+u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth0,
+ u32 *bth1, u32 *bth2, u32 *len, bool *last)
+{
+ struct hfi1_ack_priv *epriv = e->priv;
+ struct tid_rdma_request *req = &epriv->tid_req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+ u32 tidentry = flow->tid_entry[flow->tid_idx];
+ u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
+ struct tid_rdma_read_resp *resp = &ohdr->u.tid_rdma.r_rsp;
+ u32 next_offset, om = KDETH_OM_LARGE;
+ bool last_pkt;
+ u32 hdwords = 0;
+ struct tid_rdma_params *remote;
+
+ *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
+ flow->sent += *len;
+ next_offset = flow->tid_offset + *len;
+ last_pkt = (flow->sent >= flow->length);
+
+ trace_hfi1_tid_entry_build_read_resp(qp, flow->tid_idx, tidentry);
+ trace_hfi1_tid_flow_build_read_resp(qp, req->clear_tail, flow);
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ if (!remote) {
+ rcu_read_unlock();
+ goto done;
+ }
+ KDETH_RESET(resp->kdeth0, KVER, 0x1);
+ KDETH_SET(resp->kdeth0, SH, !last_pkt);
+ KDETH_SET(resp->kdeth0, INTR, !!(!last_pkt && remote->urg));
+ KDETH_SET(resp->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
+ KDETH_SET(resp->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
+ KDETH_SET(resp->kdeth0, OM, om == KDETH_OM_LARGE);
+ KDETH_SET(resp->kdeth0, OFFSET, flow->tid_offset / om);
+ KDETH_RESET(resp->kdeth1, JKEY, remote->jkey);
+ resp->verbs_qp = cpu_to_be32(qp->remote_qpn);
+ rcu_read_unlock();
+
+ resp->aeth = rvt_compute_aeth(qp);
+ resp->verbs_psn = cpu_to_be32(mask_psn(flow->flow_state.ib_spsn +
+ flow->pkt));
+
+ *bth0 = TID_OP(READ_RESP) << 24;
+ *bth1 = flow->tid_qpn;
+ *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
+ HFI1_KDETH_BTH_SEQ_MASK) |
+ (flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT));
+ *last = last_pkt;
+ if (last_pkt)
+ /* Advance to next flow */
+ req->clear_tail = (req->clear_tail + 1) &
+ (MAX_FLOWS - 1);
+
+ if (next_offset >= tidlen) {
+ flow->tid_offset = 0;
+ flow->tid_idx++;
+ } else {
+ flow->tid_offset = next_offset;
+ }
+
+ hdwords = sizeof(ohdr->u.tid_rdma.r_rsp) / sizeof(u32);
+
+done:
+ return hdwords;
+}
+
+static inline struct tid_rdma_request *
+find_tid_request(struct rvt_qp *qp, u32 psn, enum ib_wr_opcode opcode)
+ __must_hold(&qp->s_lock)
+{
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req = NULL;
+ u32 i, end;
+
+ end = qp->s_cur + 1;
+ if (end == qp->s_size)
+ end = 0;
+ for (i = qp->s_acked; i != end;) {
+ wqe = rvt_get_swqe_ptr(qp, i);
+ if (cmp_psn(psn, wqe->psn) >= 0 &&
+ cmp_psn(psn, wqe->lpsn) <= 0) {
+ if (wqe->wr.opcode == opcode)
+ req = wqe_to_tid_req(wqe);
+ break;
+ }
+ if (++i == qp->s_size)
+ i = 0;
+ }
+
+ return req;
+}
+
+void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
+{
+ /* HANDLER FOR TID RDMA READ RESPONSE packet (Requestor side */
+
+ /*
+ * 1. Find matching SWQE
+ * 2. Check that the entire segment has been read.
+ * 3. Remove HFI1_S_WAIT_TID_RESP from s_flags.
+ * 4. Free the TID flow resources.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ */
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 opcode, aeth;
+ bool fecn;
+ unsigned long flags;
+ u32 kpsn, ipsn;
+
+ trace_hfi1_sender_rcv_tid_read_resp(qp);
+ fecn = process_ecn(qp, packet);
+ kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ ipsn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn));
+ req = find_tid_request(qp, ipsn, IB_WR_TID_RDMA_READ);
+ if (unlikely(!req))
+ goto ack_op_err;
+
+ flow = &req->flows[req->clear_tail];
+ /* When header suppression is disabled */
+ if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) {
+ update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
+
+ if (cmp_psn(kpsn, flow->flow_state.r_next_psn))
+ goto ack_done;
+ flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
+ /*
+ * Copy the payload to destination buffer if this packet is
+ * delivered as an eager packet due to RSM rule and FECN.
+ * The RSM rule selects FECN bit in BTH and SH bit in
+ * KDETH header and therefore will not match the last
+ * packet of each segment that has SH bit cleared.
+ */
+ if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
+ struct rvt_sge_state ss;
+ u32 len;
+ u32 tlen = packet->tlen;
+ u16 hdrsize = packet->hlen;
+ u8 pad = packet->pad;
+ u8 extra_bytes = pad + packet->extra_byte +
+ (SIZE_OF_CRC << 2);
+ u32 pmtu = qp->pmtu;
+
+ if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
+ goto ack_op_err;
+ len = restart_sge(&ss, req->e.swqe, ipsn, pmtu);
+ if (unlikely(len < pmtu))
+ goto ack_op_err;
+ rvt_copy_sge(qp, &ss, packet->payload, pmtu, false,
+ false);
+ /* Raise the sw sequence check flag for next packet */
+ priv->s_flags |= HFI1_R_TID_SW_PSN;
+ }
+
+ goto ack_done;
+ }
+ flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
+ req->ack_pending--;
+ priv->pending_tid_r_segs--;
+ qp->s_num_rd_atomic--;
+ if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
+ !qp->s_num_rd_atomic) {
+ qp->s_flags &= ~(RVT_S_WAIT_FENCE |
+ RVT_S_WAIT_ACK);
+ hfi1_schedule_send(qp);
+ }
+ if (qp->s_flags & RVT_S_WAIT_RDMAR) {
+ qp->s_flags &= ~(RVT_S_WAIT_RDMAR | RVT_S_WAIT_ACK);
+ hfi1_schedule_send(qp);
+ }
+
+ trace_hfi1_ack(qp, ipsn);
+ trace_hfi1_tid_req_rcv_read_resp(qp, 0, req->e.swqe->wr.opcode,
+ req->e.swqe->psn, req->e.swqe->lpsn,
+ req);
+ trace_hfi1_tid_flow_rcv_read_resp(qp, req->clear_tail, flow);
+
+ /* Release the tid resources */
+ hfi1_kern_exp_rcv_clear(req);
+
+ if (!do_rc_ack(qp, aeth, ipsn, opcode, 0, rcd))
+ goto ack_done;
+
+ /* If not done yet, build next read request */
+ if (++req->comp_seg >= req->total_segs) {
+ priv->tid_r_comp++;
+ req->state = TID_REQUEST_COMPLETE;
+ }
+
+ /*
+ * Clear the hw flow under two conditions:
+ * 1. This request is a sync point and it is complete;
+ * 2. Current request is completed and there are no more requests.
+ */
+ if ((req->state == TID_REQUEST_SYNC &&
+ req->comp_seg == req->cur_seg) ||
+ priv->tid_r_comp == priv->tid_r_reqs) {
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+ priv->s_flags &= ~HFI1_R_TID_SW_PSN;
+ if (req->state == TID_REQUEST_SYNC)
+ req->state = TID_REQUEST_ACTIVE;
+ }
+
+ hfi1_schedule_send(qp);
+ goto ack_done;
+
+ack_op_err:
+ /*
+ * The test indicates that the send engine has finished its cleanup
+ * after sending the request and it's now safe to put the QP into error
+ * state. However, if the wqe queue is empty (qp->s_acked == qp->s_tail
+ * == qp->s_head), it would be unsafe to complete the wqe pointed by
+ * qp->s_acked here. Putting the qp into error state will safely flush
+ * all remaining requests.
+ */
+ if (qp->s_last == qp->s_acked)
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+
+ack_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ u32 n = qp->s_acked;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ /* Free any TID entries */
+ while (n != qp->s_tail) {
+ wqe = rvt_get_swqe_ptr(qp, n);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ req = wqe_to_tid_req(wqe);
+ hfi1_kern_exp_rcv_clear_all(req);
+ }
+
+ if (++n == qp->s_size)
+ n = 0;
+ }
+ /* Free flow */
+ hfi1_kern_clear_hw_flow(priv->rcd, qp);
+}
+
+static bool tid_rdma_tid_err(struct hfi1_packet *packet, u8 rcv_type)
+{
+ struct rvt_qp *qp = packet->qp;
+
+ if (rcv_type >= RHF_RCV_TYPE_IB)
+ goto done;
+
+ spin_lock(&qp->s_lock);
+
+ /*
+ * We've ran out of space in the eager buffer.
+ * Eagerly received KDETH packets which require space in the
+ * Eager buffer (packet that have payload) are TID RDMA WRITE
+ * response packets. In this case, we have to re-transmit the
+ * TID RDMA WRITE request.
+ */
+ if (rcv_type == RHF_RCV_TYPE_EAGER) {
+ hfi1_restart_rc(qp, qp->s_last_psn + 1, 1);
+ hfi1_schedule_send(qp);
+ }
+
+ /* Since no payload is delivered, just drop the packet */
+ spin_unlock(&qp->s_lock);
+done:
+ return true;
+}
+
+static void restart_tid_rdma_read_req(struct hfi1_ctxtdata *rcd,
+ struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+
+ /* Start from the right segment */
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ req = wqe_to_tid_req(wqe);
+ flow = &req->flows[req->clear_tail];
+ hfi1_restart_rc(qp, flow->flow_state.ib_spsn, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ rvt_get_qp(qp);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+}
+
+/*
+ * Handle the KDETH eflags for TID RDMA READ response.
+ *
+ * Return true if the last packet for a segment has been received and it is
+ * time to process the response normally; otherwise, return true.
+ *
+ * The caller must hold the packet->qp->r_lock and the rcu_read_lock.
+ */
+static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+ struct hfi1_packet *packet, u8 rcv_type,
+ u8 rte, u32 psn, u32 ibpsn)
+ __must_hold(&packet->qp->r_lock) __must_hold(RCU)
+{
+ struct hfi1_pportdata *ppd = rcd->ppd;
+ struct hfi1_devdata *dd = ppd->dd;
+ struct hfi1_ibport *ibp;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 ack_psn;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ bool ret = true;
+ int diff = 0;
+ u32 fpsn;
+
+ lockdep_assert_held(&qp->r_lock);
+ trace_hfi1_rsp_read_kdeth_eflags(qp, ibpsn);
+ trace_hfi1_sender_read_kdeth_eflags(qp);
+ trace_hfi1_tid_read_sender_kdeth_eflags(qp, 0);
+ spin_lock(&qp->s_lock);
+ /* If the psn is out of valid range, drop the packet */
+ if (cmp_psn(ibpsn, qp->s_last_psn) < 0 ||
+ cmp_psn(ibpsn, qp->s_psn) > 0)
+ goto s_unlock;
+
+ /*
+ * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+ * requests and implicitly NAK RDMA read and atomic requests issued
+ * before the NAK'ed request.
+ */
+ ack_psn = ibpsn - 1;
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+
+ /* Complete WQEs that the PSN finishes. */
+ while ((int)delta_psn(ack_psn, wqe->lpsn) >= 0) {
+ /*
+ * If this request is a RDMA read or atomic, and the NACK is
+ * for a later operation, this NACK NAKs the RDMA read or
+ * atomic.
+ */
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ /* Retry this request. */
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
+ qp->r_flags |= RVT_R_RDMAR_SEQ;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ restart_tid_rdma_read_req(rcd, qp,
+ wqe);
+ } else {
+ hfi1_restart_rc(qp, qp->s_last_psn + 1,
+ 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= RVT_R_RSP_SEND;
+ rvt_get_qp(qp);
+ list_add_tail(/* wait */
+ &qp->rspwait,
+ &rcd->qp_wait_list);
+ }
+ }
+ }
+ /*
+ * No need to process the NAK since we are
+ * restarting an earlier request.
+ */
+ break;
+ }
+
+ wqe = do_rc_completion(qp, wqe, ibp);
+ if (qp->s_acked == qp->s_tail)
+ goto s_unlock;
+ }
+
+ if (qp->s_acked == qp->s_tail)
+ goto s_unlock;
+
+ /* Handle the eflags for the request */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+ goto s_unlock;
+
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_read_kdeth_eflags(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ switch (rcv_type) {
+ case RHF_RCV_TYPE_EXPECTED:
+ switch (rte) {
+ case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
+ /*
+ * On the first occurrence of a Flow Sequence error,
+ * the flag TID_FLOW_SW_PSN is set.
+ *
+ * After that, the flow is *not* reprogrammed and the
+ * protocol falls back to SW PSN checking. This is done
+ * to prevent continuous Flow Sequence errors for any
+ * packets that could be still in the fabric.
+ */
+ flow = &req->flows[req->clear_tail];
+ trace_hfi1_tid_flow_read_kdeth_eflags(qp,
+ req->clear_tail,
+ flow);
+ if (priv->s_flags & HFI1_R_TID_SW_PSN) {
+ diff = cmp_psn(psn,
+ flow->flow_state.r_next_psn);
+ if (diff > 0) {
+ /* Drop the packet.*/
+ goto s_unlock;
+ } else if (diff < 0) {
+ /*
+ * If a response packet for a restarted
+ * request has come back, reset the
+ * restart flag.
+ */
+ if (qp->r_flags & RVT_R_RDMAR_SEQ)
+ qp->r_flags &=
+ ~RVT_R_RDMAR_SEQ;
+
+ /* Drop the packet.*/
+ goto s_unlock;
+ }
+
+ /*
+ * If SW PSN verification is successful and
+ * this is the last packet in the segment, tell
+ * the caller to process it as a normal packet.
+ */
+ fpsn = full_flow_psn(flow,
+ flow->flow_state.lpsn);
+ if (cmp_psn(fpsn, psn) == 0) {
+ ret = false;
+ if (qp->r_flags & RVT_R_RDMAR_SEQ)
+ qp->r_flags &=
+ ~RVT_R_RDMAR_SEQ;
+ }
+ flow->flow_state.r_next_psn =
+ mask_psn(psn + 1);
+ } else {
+ u32 last_psn;
+
+ last_psn = read_r_next_psn(dd, rcd->ctxt,
+ flow->idx);
+ flow->flow_state.r_next_psn = last_psn;
+ priv->s_flags |= HFI1_R_TID_SW_PSN;
+ /*
+ * If no request has been restarted yet,
+ * restart the current one.
+ */
+ if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
+ restart_tid_rdma_read_req(rcd, qp,
+ wqe);
+ }
+
+ break;
+
+ case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
+ /*
+ * Since the TID flow is able to ride through
+ * generation mismatch, drop this stale packet.
+ */
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ case RHF_RCV_TYPE_ERROR:
+ switch (rte) {
+ case RHF_RTE_ERROR_OP_CODE_ERR:
+ case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
+ case RHF_RTE_ERROR_KHDR_HCRC_ERR:
+ case RHF_RTE_ERROR_KHDR_KVER_ERR:
+ case RHF_RTE_ERROR_CONTEXT_ERR:
+ case RHF_RTE_ERROR_KHDR_TID_ERR:
+ default:
+ break;
+ }
+ default:
+ break;
+ }
+s_unlock:
+ spin_unlock(&qp->s_lock);
+ return ret;
+}
+
+bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+ struct hfi1_pportdata *ppd,
+ struct hfi1_packet *packet)
+{
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ struct hfi1_devdata *dd = ppd->dd;
+ struct rvt_dev_info *rdi = &dd->verbs_dev.rdi;
+ u8 rcv_type = rhf_rcv_type(packet->rhf);
+ u8 rte = rhf_rcv_type_err(packet->rhf);
+ struct ib_header *hdr = packet->hdr;
+ struct ib_other_headers *ohdr = NULL;
+ int lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ u16 lid = be16_to_cpu(hdr->lrh[1]);
+ u8 opcode;
+ u32 qp_num, psn, ibpsn;
+ struct rvt_qp *qp;
+ struct hfi1_qp_priv *qpriv;
+ unsigned long flags;
+ bool ret = true;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ int diff = 0;
+
+ trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
+ packet->rhf);
+ if (packet->rhf & RHF_ICRC_ERR)
+ return ret;
+
+ packet->ohdr = &hdr->u.oth;
+ ohdr = packet->ohdr;
+ trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+ /* Get the destination QP number. */
+ qp_num = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_qp) &
+ RVT_QPN_MASK;
+ if (lid >= be16_to_cpu(IB_MULTICAST_LID_BASE))
+ goto drop;
+
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ rcu_read_lock();
+ qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+ if (!qp)
+ goto rcu_unlock;
+
+ packet->qp = qp;
+
+ /* Check for valid receive state. */
+ spin_lock_irqsave(&qp->r_lock, flags);
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
+ ibp->rvp.n_pkt_drops++;
+ goto r_unlock;
+ }
+
+ if (packet->rhf & RHF_TID_ERR) {
+ /* For TIDERR and RC QPs preemptively schedule a NAK */
+ u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+
+ /* Sanity check packet */
+ if (tlen < 24)
+ goto r_unlock;
+
+ /*
+ * Check for GRH. We should never get packets with GRH in this
+ * path.
+ */
+ if (lnh == HFI1_LRH_GRH)
+ goto r_unlock;
+
+ if (tid_rdma_tid_err(packet, rcv_type))
+ goto r_unlock;
+ }
+
+ /* handle TID RDMA READ */
+ if (opcode == TID_OP(READ_RESP)) {
+ ibpsn = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.verbs_psn);
+ ibpsn = mask_psn(ibpsn);
+ ret = handle_read_kdeth_eflags(rcd, packet, rcv_type, rte, psn,
+ ibpsn);
+ goto r_unlock;
+ }
+
+ /*
+ * qp->s_tail_ack_queue points to the rvt_ack_entry currently being
+ * processed. These a completed sequentially so we can be sure that
+ * the pointer will not change until the entire request has completed.
+ */
+ spin_lock(&qp->s_lock);
+ qpriv = qp->priv;
+ if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID ||
+ qpriv->r_tid_tail == qpriv->r_tid_head)
+ goto unlock;
+ e = &qp->s_ack_queue[qpriv->r_tid_tail];
+ if (e->opcode != TID_OP(WRITE_REQ))
+ goto unlock;
+ req = ack_to_tid_req(e);
+ if (req->comp_seg == req->cur_seg)
+ goto unlock;
+ flow = &req->flows[req->clear_tail];
+ trace_hfi1_eflags_err_write(qp, rcv_type, rte, psn);
+ trace_hfi1_rsp_handle_kdeth_eflags(qp, psn);
+ trace_hfi1_tid_write_rsp_handle_kdeth_eflags(qp);
+ trace_hfi1_tid_req_handle_kdeth_eflags(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ trace_hfi1_tid_flow_handle_kdeth_eflags(qp, req->clear_tail, flow);
+
+ switch (rcv_type) {
+ case RHF_RCV_TYPE_EXPECTED:
+ switch (rte) {
+ case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
+ if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) {
+ qpriv->s_flags |= HFI1_R_TID_SW_PSN;
+ flow->flow_state.r_next_psn =
+ read_r_next_psn(dd, rcd->ctxt,
+ flow->idx);
+ qpriv->r_next_psn_kdeth =
+ flow->flow_state.r_next_psn;
+ goto nak_psn;
+ } else {
+ /*
+ * If the received PSN does not match the next
+ * expected PSN, NAK the packet.
+ * However, only do that if we know that the a
+ * NAK has already been sent. Otherwise, this
+ * mismatch could be due to packets that were
+ * already in flight.
+ */
+ diff = cmp_psn(psn,
+ flow->flow_state.r_next_psn);
+ if (diff > 0)
+ goto nak_psn;
+ else if (diff < 0)
+ break;
+
+ qpriv->s_nak_state = 0;
+ /*
+ * If SW PSN verification is successful and this
+ * is the last packet in the segment, tell the
+ * caller to process it as a normal packet.
+ */
+ if (psn == full_flow_psn(flow,
+ flow->flow_state.lpsn))
+ ret = false;
+ flow->flow_state.r_next_psn =
+ mask_psn(psn + 1);
+ qpriv->r_next_psn_kdeth =
+ flow->flow_state.r_next_psn;
+ }
+ break;
+
+ case RHF_RTE_EXPECTED_FLOW_GEN_ERR:
+ goto nak_psn;
+
+ default:
+ break;
+ }
+ break;
+
+ case RHF_RCV_TYPE_ERROR:
+ switch (rte) {
+ case RHF_RTE_ERROR_OP_CODE_ERR:
+ case RHF_RTE_ERROR_KHDR_MIN_LEN_ERR:
+ case RHF_RTE_ERROR_KHDR_HCRC_ERR:
+ case RHF_RTE_ERROR_KHDR_KVER_ERR:
+ case RHF_RTE_ERROR_CONTEXT_ERR:
+ case RHF_RTE_ERROR_KHDR_TID_ERR:
+ default:
+ break;
+ }
+ default:
+ break;
+ }
+
+unlock:
+ spin_unlock(&qp->s_lock);
+r_unlock:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+rcu_unlock:
+ rcu_read_unlock();
+drop:
+ return ret;
+nak_psn:
+ ibp->rvp.n_rc_seqnak++;
+ if (!qpriv->s_nak_state) {
+ qpriv->s_nak_state = IB_NAK_PSN_ERROR;
+ /* We are NAK'ing the next expected PSN */
+ qpriv->s_nak_psn = mask_psn(flow->flow_state.r_next_psn);
+ tid_rdma_trigger_ack(qp);
+ }
+ goto unlock;
+}
+
+/*
+ * "Rewind" the TID request information.
+ * This means that we reset the state back to ACTIVE,
+ * find the proper flow, set the flow index to that flow,
+ * and reset the flow information.
+ */
+void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ u32 *bth2)
+{
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ int diff, delta_pkts;
+ u32 tididx = 0, i;
+ u16 fidx;
+
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ *bth2 = mask_psn(qp->s_psn);
+ flow = find_flow_ib(req, *bth2, &fidx);
+ if (!flow) {
+ trace_hfi1_msg_tid_restart_req(/* msg */
+ qp, "!!!!!! Could not find flow to restart: bth2 ",
+ (u64)*bth2);
+ trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ req);
+ return;
+ }
+ } else {
+ fidx = req->acked_tail;
+ flow = &req->flows[fidx];
+ *bth2 = mask_psn(req->r_ack_psn);
+ }
+
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ delta_pkts = delta_psn(*bth2, flow->flow_state.ib_spsn);
+ else
+ delta_pkts = delta_psn(*bth2,
+ full_flow_psn(flow,
+ flow->flow_state.spsn));
+
+ trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
+ diff = delta_pkts + flow->resync_npkts;
+
+ flow->sent = 0;
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->tid_offset = 0;
+ if (diff) {
+ for (tididx = 0; tididx < flow->tidcnt; tididx++) {
+ u32 tidentry = flow->tid_entry[tididx], tidlen,
+ tidnpkts, npkts;
+
+ flow->tid_offset = 0;
+ tidlen = EXP_TID_GET(tidentry, LEN) * PAGE_SIZE;
+ tidnpkts = rvt_div_round_up_mtu(qp, tidlen);
+ npkts = min_t(u32, diff, tidnpkts);
+ flow->pkt += npkts;
+ flow->sent += (npkts == tidnpkts ? tidlen :
+ npkts * qp->pmtu);
+ flow->tid_offset += npkts * qp->pmtu;
+ diff -= npkts;
+ if (!diff)
+ break;
+ }
+ }
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ rvt_skip_sge(&qpriv->tid_ss, (req->cur_seg * req->seg_len) +
+ flow->sent, 0);
+ /*
+ * Packet PSN is based on flow_state.spsn + flow->pkt. However,
+ * during a RESYNC, the generation is incremented and the
+ * sequence is reset to 0. Since we've adjusted the npkts in the
+ * flow and the SGE has been sufficiently advanced, we have to
+ * adjust flow->pkt in order to calculate the correct PSN.
+ */
+ flow->pkt -= flow->resync_npkts;
+ }
+
+ if (flow->tid_offset ==
+ EXP_TID_GET(flow->tid_entry[tididx], LEN) * PAGE_SIZE) {
+ tididx++;
+ flow->tid_offset = 0;
+ }
+ flow->tid_idx = tididx;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
+ /* Move flow_idx to correct index */
+ req->flow_idx = fidx;
+ else
+ req->clear_tail = fidx;
+
+ trace_hfi1_tid_flow_restart_req(qp, fidx, flow);
+ trace_hfi1_tid_req_restart_req(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ req->state = TID_REQUEST_ACTIVE;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
+ /* Reset all the flows that we are going to resend */
+ fidx = CIRC_NEXT(fidx, MAX_FLOWS);
+ i = qpriv->s_tid_tail;
+ do {
+ for (; CIRC_CNT(req->setup_head, fidx, MAX_FLOWS);
+ fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
+ req->flows[fidx].sent = 0;
+ req->flows[fidx].pkt = 0;
+ req->flows[fidx].tid_idx = 0;
+ req->flows[fidx].tid_offset = 0;
+ req->flows[fidx].resync_npkts = 0;
+ }
+ if (i == qpriv->s_tid_cur)
+ break;
+ do {
+ i = (++i == qp->s_size ? 0 : i);
+ wqe = rvt_get_swqe_ptr(qp, i);
+ } while (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE);
+ req = wqe_to_tid_req(wqe);
+ req->cur_seg = req->ack_seg;
+ fidx = req->acked_tail;
+ /* Pull req->clear_tail back */
+ req->clear_tail = fidx;
+ } while (1);
+ }
+}
+
+void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp)
+{
+ int i, ret;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_flow_state *fs;
+
+ if (qp->ibqp.qp_type != IB_QPT_RC || !HFI1_CAP_IS_KSET(TID_RDMA))
+ return;
+
+ /*
+ * First, clear the flow to help prevent any delayed packets from
+ * being delivered.
+ */
+ fs = &qpriv->flow_state;
+ if (fs->index != RXE_NUM_TID_FLOWS)
+ hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+
+ for (i = qp->s_acked; i != qp->s_head;) {
+ struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, i);
+
+ if (++i == qp->s_size)
+ i = 0;
+ /* Free only locally allocated TID entries */
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
+ continue;
+ do {
+ struct hfi1_swqe_priv *priv = wqe->priv;
+
+ ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
+ } while (!ret);
+ }
+ for (i = qp->s_acked_ack_queue; i != qp->r_head_ack_queue;) {
+ struct rvt_ack_entry *e = &qp->s_ack_queue[i];
+
+ if (++i == rvt_max_atomic(ib_to_rvt(qp->ibqp.device)))
+ i = 0;
+ /* Free only locally allocated TID entries */
+ if (e->opcode != TID_OP(WRITE_REQ))
+ continue;
+ do {
+ struct hfi1_ack_priv *priv = e->priv;
+
+ ret = hfi1_kern_exp_rcv_clear(&priv->tid_req);
+ } while (!ret);
+ }
+}
+
+bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct rvt_swqe *prev;
+ struct hfi1_qp_priv *priv = qp->priv;
+ u32 s_prev;
+ struct tid_rdma_request *req;
+
+ s_prev = (qp->s_cur == 0 ? qp->s_size : qp->s_cur) - 1;
+ prev = rvt_get_swqe_ptr(qp, s_prev);
+
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ case IB_WR_SEND_WITH_INV:
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ case IB_WR_RDMA_WRITE:
+ switch (prev->wr.opcode) {
+ case IB_WR_TID_RDMA_WRITE:
+ req = wqe_to_tid_req(prev);
+ if (req->ack_seg != req->total_segs)
+ goto interlock;
+ default:
+ break;
+ }
+ break;
+ case IB_WR_RDMA_READ:
+ if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ break;
+ /* fall through */
+ case IB_WR_TID_RDMA_READ:
+ switch (prev->wr.opcode) {
+ case IB_WR_RDMA_READ:
+ if (qp->s_acked != qp->s_cur)
+ goto interlock;
+ break;
+ case IB_WR_TID_RDMA_WRITE:
+ req = wqe_to_tid_req(prev);
+ if (req->ack_seg != req->total_segs)
+ goto interlock;
+ default:
+ break;
+ }
+ default:
+ break;
+ }
+ return false;
+
+interlock:
+ priv->s_flags |= HFI1_S_TID_WAIT_INTERLCK;
+ return true;
+}
+
+/* Does @sge meet the alignment requirements for tid rdma? */
+static inline bool hfi1_check_sge_align(struct rvt_qp *qp,
+ struct rvt_sge *sge, int num_sge)
+{
+ int i;
+
+ for (i = 0; i < num_sge; i++, sge++) {
+ trace_hfi1_sge_check_align(qp, i, sge);
+ if ((u64)sge->vaddr & ~PAGE_MASK ||
+ sge->sge_length & ~PAGE_MASK)
+ return false;
+ }
+ return true;
+}
+
+void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ struct hfi1_qp_priv *qpriv = (struct hfi1_qp_priv *)qp->priv;
+ struct hfi1_swqe_priv *priv = wqe->priv;
+ struct tid_rdma_params *remote;
+ enum ib_wr_opcode new_opcode;
+ bool do_tid_rdma = false;
+ struct hfi1_pportdata *ppd = qpriv->rcd->ppd;
+
+ if ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) ==
+ ppd->lid)
+ return;
+ if (qpriv->hdr_type != HFI1_PKT_TYPE_9B)
+ return;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ /*
+ * If TID RDMA is disabled by the negotiation, don't
+ * use it.
+ */
+ if (!remote)
+ goto exit;
+
+ if (wqe->wr.opcode == IB_WR_RDMA_READ) {
+ if (hfi1_check_sge_align(qp, &wqe->sg_list[0],
+ wqe->wr.num_sge)) {
+ new_opcode = IB_WR_TID_RDMA_READ;
+ do_tid_rdma = true;
+ }
+ } else if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
+ /*
+ * TID RDMA is enabled for this RDMA WRITE request iff:
+ * 1. The remote address is page-aligned,
+ * 2. The length is larger than the minimum segment size,
+ * 3. The length is page-multiple.
+ */
+ if (!(wqe->rdma_wr.remote_addr & ~PAGE_MASK) &&
+ !(wqe->length & ~PAGE_MASK)) {
+ new_opcode = IB_WR_TID_RDMA_WRITE;
+ do_tid_rdma = true;
+ }
+ }
+
+ if (do_tid_rdma) {
+ if (hfi1_kern_exp_rcv_alloc_flows(&priv->tid_req, GFP_ATOMIC))
+ goto exit;
+ wqe->wr.opcode = new_opcode;
+ priv->tid_req.seg_len =
+ min_t(u32, remote->max_len, wqe->length);
+ priv->tid_req.total_segs =
+ DIV_ROUND_UP(wqe->length, priv->tid_req.seg_len);
+ /* Compute the last PSN of the request */
+ wqe->lpsn = wqe->psn;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
+ priv->tid_req.n_flows = remote->max_read;
+ qpriv->tid_r_reqs++;
+ wqe->lpsn += rvt_div_round_up_mtu(qp, wqe->length) - 1;
+ } else {
+ wqe->lpsn += priv->tid_req.total_segs - 1;
+ atomic_inc(&qpriv->n_requests);
+ }
+
+ priv->tid_req.cur_seg = 0;
+ priv->tid_req.comp_seg = 0;
+ priv->tid_req.ack_seg = 0;
+ priv->tid_req.state = TID_REQUEST_INACTIVE;
+ /*
+ * Reset acked_tail.
+ * TID RDMA READ does not have ACKs so it does not
+ * update the pointer. We have to reset it so TID RDMA
+ * WRITE does not get confused.
+ */
+ priv->tid_req.acked_tail = priv->tid_req.setup_head;
+ trace_hfi1_tid_req_setup_tid_wqe(qp, 1, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn,
+ &priv->tid_req);
+ }
+exit:
+ rcu_read_unlock();
+}
+
+/* TID RDMA WRITE functions */
+
+u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_params *remote;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ /*
+ * Set the number of flow to be used based on negotiated
+ * parameters.
+ */
+ req->n_flows = remote->max_write;
+ req->state = TID_REQUEST_ACTIVE;
+
+ KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth0, KVER, 0x1);
+ KDETH_RESET(ohdr->u.tid_rdma.w_req.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.w_req.reth.vaddr =
+ cpu_to_be64(wqe->rdma_wr.remote_addr + (wqe->length - *len));
+ ohdr->u.tid_rdma.w_req.reth.rkey =
+ cpu_to_be32(wqe->rdma_wr.rkey);
+ ohdr->u.tid_rdma.w_req.reth.length = cpu_to_be32(*len);
+ ohdr->u.tid_rdma.w_req.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 &= ~RVT_QPN_MASK;
+ *bth1 |= remote->qp;
+ qp->s_state = TID_OP(WRITE_REQ);
+ qp->s_flags |= HFI1_S_WAIT_TID_RESP;
+ *bth2 |= IB_BTH_REQ_ACK;
+ *len = 0;
+
+ rcu_read_unlock();
+ return sizeof(ohdr->u.tid_rdma.w_req) / sizeof(u32);
+}
+
+static u32 hfi1_compute_tid_rdma_flow_wt(struct rvt_qp *qp)
+{
+ /*
+ * Heuristic for computing the RNR timeout when waiting on the flow
+ * queue. Rather than a computationaly expensive exact estimate of when
+ * a flow will be available, we assume that if a QP is at position N in
+ * the flow queue it has to wait approximately (N + 1) * (number of
+ * segments between two sync points). The rationale for this is that
+ * flows are released and recycled at each sync point.
+ */
+ return (MAX_TID_FLOW_PSN * qp->pmtu) >> TID_RDMA_SEGMENT_SHIFT;
+}
+
+static u32 position_in_queue(struct hfi1_qp_priv *qpriv,
+ struct tid_queue *queue)
+{
+ return qpriv->tid_enqueue - queue->dequeue;
+}
+
+/*
+ * @qp: points to rvt_qp context.
+ * @to_seg: desired RNR timeout in segments.
+ * Return: index of the next highest timeout in the ib_hfi1_rnr_table[]
+ */
+static u32 hfi1_compute_tid_rnr_timeout(struct rvt_qp *qp, u32 to_seg)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u64 timeout;
+ u32 bytes_per_us;
+ u8 i;
+
+ bytes_per_us = active_egress_rate(qpriv->rcd->ppd) / 8;
+ timeout = (to_seg * TID_RDMA_MAX_SEGMENT_SIZE) / bytes_per_us;
+ /*
+ * Find the next highest value in the RNR table to the required
+ * timeout. This gives the responder some padding.
+ */
+ for (i = 1; i <= IB_AETH_CREDIT_MASK; i++)
+ if (rvt_rnr_tbl_to_usec(i) >= timeout)
+ return i;
+ return 0;
+}
+
+/**
+ * Central place for resource allocation at TID write responder,
+ * is called from write_req and write_data interrupt handlers as
+ * well as the send thread when a queued QP is scheduled for
+ * resource allocation.
+ *
+ * Iterates over (a) segments of a request and then (b) queued requests
+ * themselves to allocate resources for up to local->max_write
+ * segments across multiple requests. Stop allocating when we
+ * hit a sync point, resume allocating after data packets at
+ * sync point have been received.
+ *
+ * Resource allocation and sending of responses is decoupled. The
+ * request/segment which are being allocated and sent are as follows.
+ * Resources are allocated for:
+ * [request: qpriv->r_tid_alloc, segment: req->alloc_seg]
+ * The send thread sends:
+ * [request: qp->s_tail_ack_queue, segment:req->cur_seg]
+ */
+static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
+{
+ struct tid_rdma_request *req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ctxtdata *rcd = qpriv->rcd;
+ struct tid_rdma_params *local = &qpriv->tid_rdma.local;
+ struct rvt_ack_entry *e;
+ u32 npkts, to_seg;
+ bool last;
+ int ret = 0;
+
+ lockdep_assert_held(&qp->s_lock);
+
+ while (1) {
+ trace_hfi1_rsp_tid_write_alloc_res(qp, 0);
+ trace_hfi1_tid_write_rsp_alloc_res(qp);
+ /*
+ * Don't allocate more segments if a RNR NAK has already been
+ * scheduled to avoid messing up qp->r_psn: the RNR NAK will
+ * be sent only when all allocated segments have been sent.
+ * However, if more segments are allocated before that, TID RDMA
+ * WRITE RESP packets will be sent out for these new segments
+ * before the RNR NAK packet. When the requester receives the
+ * RNR NAK packet, it will restart with qp->s_last_psn + 1,
+ * which does not match qp->r_psn and will be dropped.
+ * Consequently, the requester will exhaust its retries and
+ * put the qp into error state.
+ */
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND)
+ break;
+
+ /* No requests left to process */
+ if (qpriv->r_tid_alloc == qpriv->r_tid_head) {
+ /* If all data has been received, clear the flow */
+ if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS &&
+ !qpriv->alloc_w_segs) {
+ hfi1_kern_clear_hw_flow(rcd, qp);
+ qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+ }
+ break;
+ }
+
+ e = &qp->s_ack_queue[qpriv->r_tid_alloc];
+ if (e->opcode != TID_OP(WRITE_REQ))
+ goto next_req;
+ req = ack_to_tid_req(e);
+ trace_hfi1_tid_req_write_alloc_res(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+ /* Finished allocating for all segments of this request */
+ if (req->alloc_seg >= req->total_segs)
+ goto next_req;
+
+ /* Can allocate only a maximum of local->max_write for a QP */
+ if (qpriv->alloc_w_segs >= local->max_write)
+ break;
+
+ /* Don't allocate at a sync point with data packets pending */
+ if (qpriv->sync_pt && qpriv->alloc_w_segs)
+ break;
+
+ /* All data received at the sync point, continue */
+ if (qpriv->sync_pt && !qpriv->alloc_w_segs) {
+ hfi1_kern_clear_hw_flow(rcd, qp);
+ qpriv->sync_pt = false;
+ qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+ }
+
+ /* Allocate flow if we don't have one */
+ if (qpriv->flow_state.index >= RXE_NUM_TID_FLOWS) {
+ ret = hfi1_kern_setup_hw_flow(qpriv->rcd, qp);
+ if (ret) {
+ to_seg = hfi1_compute_tid_rdma_flow_wt(qp) *
+ position_in_queue(qpriv,
+ &rcd->flow_queue);
+ break;
+ }
+ }
+
+ npkts = rvt_div_round_up_mtu(qp, req->seg_len);
+
+ /*
+ * We are at a sync point if we run out of KDETH PSN space.
+ * Last PSN of every generation is reserved for RESYNC.
+ */
+ if (qpriv->flow_state.psn + npkts > MAX_TID_FLOW_PSN - 1) {
+ qpriv->sync_pt = true;
+ break;
+ }
+
+ /*
+ * If overtaking req->acked_tail, send an RNR NAK. Because the
+ * QP is not queued in this case, and the issue can only be
+ * caused by a delay in scheduling the second leg which we
+ * cannot estimate, we use a rather arbitrary RNR timeout of
+ * (MAX_FLOWS / 2) segments
+ */
+ if (!CIRC_SPACE(req->setup_head, req->acked_tail,
+ MAX_FLOWS)) {
+ ret = -EAGAIN;
+ to_seg = MAX_FLOWS >> 1;
+ tid_rdma_trigger_ack(qp);
+ break;
+ }
+
+ /* Try to allocate rcv array / TID entries */
+ ret = hfi1_kern_exp_rcv_setup(req, &req->ss, &last);
+ if (ret == -EAGAIN)
+ to_seg = position_in_queue(qpriv, &rcd->rarr_queue);
+ if (ret)
+ break;
+
+ qpriv->alloc_w_segs++;
+ req->alloc_seg++;
+ continue;
+next_req:
+ /* Begin processing the next request */
+ if (++qpriv->r_tid_alloc >
+ rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ qpriv->r_tid_alloc = 0;
+ }
+
+ /*
+ * Schedule an RNR NAK to be sent if (a) flow or rcv array allocation
+ * has failed (b) we are called from the rcv handler interrupt context
+ * (c) an RNR NAK has not already been scheduled
+ */
+ if (ret == -EAGAIN && intr_ctx && !qp->r_nak_state)
+ goto send_rnr_nak;
+
+ return;
+
+send_rnr_nak:
+ lockdep_assert_held(&qp->r_lock);
+
+ /* Set r_nak_state to prevent unrelated events from generating NAK's */
+ qp->r_nak_state = hfi1_compute_tid_rnr_timeout(qp, to_seg) | IB_RNR_NAK;
+
+ /* Pull back r_psn to the segment being RNR NAK'd */
+ qp->r_psn = e->psn + req->alloc_seg;
+ qp->r_ack_psn = qp->r_psn;
+ /*
+ * Pull back r_head_ack_queue to the ack entry following the request
+ * being RNR NAK'd. This allows resources to be allocated to the request
+ * if the queued QP is scheduled.
+ */
+ qp->r_head_ack_queue = qpriv->r_tid_alloc + 1;
+ if (qp->r_head_ack_queue > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ qp->r_head_ack_queue = 0;
+ qpriv->r_tid_head = qp->r_head_ack_queue;
+ /*
+ * These send side fields are used in make_rc_ack(). They are set in
+ * hfi1_send_rc_ack() but must be set here before dropping qp->s_lock
+ * for consistency
+ */
+ qp->s_nak_state = qp->r_nak_state;
+ qp->s_ack_psn = qp->r_ack_psn;
+ /*
+ * Clear the ACK PENDING flag to prevent unwanted ACK because we
+ * have modified qp->s_ack_psn here.
+ */
+ qp->s_flags &= ~(RVT_S_ACK_PENDING);
+
+ trace_hfi1_rsp_tid_write_alloc_res(qp, qp->r_psn);
+ /*
+ * qpriv->rnr_nak_state is used to determine when the scheduled RNR NAK
+ * has actually been sent. qp->s_flags RVT_S_ACK_PENDING bit cannot be
+ * used for this because qp->s_lock is dropped before calling
+ * hfi1_send_rc_ack() leading to inconsistency between the receive
+ * interrupt handlers and the send thread in make_rc_ack()
+ */
+ qpriv->rnr_nak_state = TID_RNR_NAK_SEND;
+
+ /*
+ * Schedule RNR NAK to be sent. RNR NAK's are scheduled from the receive
+ * interrupt handlers but will be sent from the send engine behind any
+ * previous responses that may have been scheduled
+ */
+ rc_defered_ack(rcd, qp);
+}
+
+void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
+{
+ /* HANDLER FOR TID RDMA WRITE REQUEST packet (Responder side)*/
+
+ /*
+ * 1. Verify TID RDMA WRITE REQ as per IB_OPCODE_RC_RDMA_WRITE_FIRST
+ * (see hfi1_rc_rcv())
+ * - Don't allow 0-length requests.
+ * 2. Put TID RDMA WRITE REQ into the response queueu (s_ack_queue)
+ * - Setup struct tid_rdma_req with request info
+ * - Prepare struct tid_rdma_flow array?
+ * 3. Set the qp->s_ack_state as state diagram in design doc.
+ * 4. Set RVT_S_RESP_PENDING in s_flags.
+ * 5. Kick the send engine (hfi1_schedule_send())
+ */
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ unsigned long flags;
+ struct ib_reth *reth;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_request *req;
+ u32 bth0, psn, len, rkey, num_segs;
+ bool fecn;
+ u8 next;
+ u64 vaddr;
+ int diff;
+
+ bth0 = be32_to_cpu(ohdr->bth[0]);
+ if (hfi1_ruc_check_hdr(ibp, packet))
+ return;
+
+ fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ trace_hfi1_rsp_rcv_tid_write_req(qp, psn);
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
+ rvt_comm_est(qp);
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto nack_inv;
+
+ reth = &ohdr->u.tid_rdma.w_req.reth;
+ vaddr = be64_to_cpu(reth->vaddr);
+ len = be32_to_cpu(reth->length);
+
+ num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
+ diff = delta_psn(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
+ return;
+ }
+
+ /*
+ * The resent request which was previously RNR NAK'd is inserted at the
+ * location of the original request, which is one entry behind
+ * r_head_ack_queue
+ */
+ if (qpriv->rnr_nak_state)
+ qp->r_head_ack_queue = qp->r_head_ack_queue ?
+ qp->r_head_ack_queue - 1 :
+ rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
+
+ /* We've verified the request, insert it into the ack queue. */
+ next = qp->r_head_ack_queue + 1;
+ if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (unlikely(next == qp->s_acked_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlock;
+ update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ req = ack_to_tid_req(e);
+
+ /* Bring previously RNR NAK'd request back to life */
+ if (qpriv->rnr_nak_state) {
+ qp->r_nak_state = 0;
+ qp->s_nak_state = 0;
+ qpriv->rnr_nak_state = TID_RNR_NAK_INIT;
+ qp->r_psn = e->lpsn + 1;
+ req->state = TID_REQUEST_INIT;
+ goto update_head;
+ }
+
+ release_rdma_sge_mr(e);
+
+ /* The length needs to be in multiples of PAGE_SIZE */
+ if (!len || len & ~PAGE_MASK)
+ goto nack_inv_unlock;
+
+ rkey = be32_to_cpu(reth->rkey);
+ qp->r_len = len;
+
+ if (e->opcode == TID_OP(WRITE_REQ) &&
+ (req->setup_head != req->clear_tail ||
+ req->clear_tail != req->acked_tail))
+ goto nack_inv_unlock;
+
+ if (unlikely(!rvt_rkey_ok(qp, &e->rdma_sge, qp->r_len, vaddr,
+ rkey, IB_ACCESS_REMOTE_WRITE)))
+ goto nack_acc;
+
+ qp->r_psn += num_segs - 1;
+
+ e->opcode = (bth0 >> 24) & 0xff;
+ e->psn = psn;
+ e->lpsn = qp->r_psn;
+ e->sent = 0;
+
+ req->n_flows = min_t(u16, num_segs, qpriv->tid_rdma.local.max_write);
+ req->state = TID_REQUEST_INIT;
+ req->cur_seg = 0;
+ req->comp_seg = 0;
+ req->ack_seg = 0;
+ req->alloc_seg = 0;
+ req->isge = 0;
+ req->seg_len = qpriv->tid_rdma.local.max_len;
+ req->total_len = len;
+ req->total_segs = num_segs;
+ req->r_flow_psn = e->psn;
+ req->ss.sge = e->rdma_sge;
+ req->ss.num_sge = 1;
+
+ req->flow_idx = req->setup_head;
+ req->clear_tail = req->setup_head;
+ req->acked_tail = req->setup_head;
+
+ qp->r_state = e->opcode;
+ qp->r_nak_state = 0;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn++;
+
+ trace_hfi1_tid_req_rcv_write_req(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+
+ if (qpriv->r_tid_tail == HFI1_QP_WQE_INVALID) {
+ qpriv->r_tid_tail = qp->r_head_ack_queue;
+ } else if (qpriv->r_tid_tail == qpriv->r_tid_head) {
+ struct tid_rdma_request *ptr;
+
+ e = &qp->s_ack_queue[qpriv->r_tid_tail];
+ ptr = ack_to_tid_req(e);
+
+ if (e->opcode != TID_OP(WRITE_REQ) ||
+ ptr->comp_seg == ptr->total_segs) {
+ if (qpriv->r_tid_tail == qpriv->r_tid_ack)
+ qpriv->r_tid_ack = qp->r_head_ack_queue;
+ qpriv->r_tid_tail = qp->r_head_ack_queue;
+ }
+ }
+update_head:
+ qp->r_head_ack_queue = next;
+ qpriv->r_tid_head = qp->r_head_ack_queue;
+
+ hfi1_tid_write_alloc_resources(qp, true);
+ trace_hfi1_tid_write_rsp_rcv_req(qp);
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ if (fecn)
+ qp->s_flags |= RVT_S_ECN;
+ hfi1_schedule_send(qp);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return;
+
+nack_inv_unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ rc_defered_ack(rcd, qp);
+ return;
+nack_acc:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+}
+
+u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 bth2, u32 *len,
+ struct rvt_sge_state **ss)
+{
+ struct hfi1_ack_priv *epriv = e->priv;
+ struct tid_rdma_request *req = &epriv->tid_req;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_flow *flow = NULL;
+ u32 resp_len = 0, hdwords = 0;
+ void *resp_addr = NULL;
+ struct tid_rdma_params *remote;
+
+ trace_hfi1_tid_req_build_write_resp(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ trace_hfi1_tid_write_rsp_build_resp(qp);
+ trace_hfi1_rsp_build_tid_write_resp(qp, bth2);
+ flow = &req->flows[req->flow_idx];
+ switch (req->state) {
+ default:
+ /*
+ * Try to allocate resources here in case QP was queued and was
+ * later scheduled when resources became available
+ */
+ hfi1_tid_write_alloc_resources(qp, false);
+
+ /* We've already sent everything which is ready */
+ if (req->cur_seg >= req->alloc_seg)
+ goto done;
+
+ /*
+ * Resources can be assigned but responses cannot be sent in
+ * rnr_nak state, till the resent request is received
+ */
+ if (qpriv->rnr_nak_state == TID_RNR_NAK_SENT)
+ goto done;
+
+ req->state = TID_REQUEST_ACTIVE;
+ trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
+ req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
+ hfi1_add_tid_reap_timer(qp);
+ break;
+
+ case TID_REQUEST_RESEND_ACTIVE:
+ case TID_REQUEST_RESEND:
+ trace_hfi1_tid_flow_build_write_resp(qp, req->flow_idx, flow);
+ req->flow_idx = CIRC_NEXT(req->flow_idx, MAX_FLOWS);
+ if (!CIRC_CNT(req->setup_head, req->flow_idx, MAX_FLOWS))
+ req->state = TID_REQUEST_ACTIVE;
+
+ hfi1_mod_tid_reap_timer(qp);
+ break;
+ }
+ flow->flow_state.resp_ib_psn = bth2;
+ resp_addr = (void *)flow->tid_entry;
+ resp_len = sizeof(*flow->tid_entry) * flow->tidcnt;
+ req->cur_seg++;
+
+ memset(&ohdr->u.tid_rdma.w_rsp, 0, sizeof(ohdr->u.tid_rdma.w_rsp));
+ epriv->ss.sge.vaddr = resp_addr;
+ epriv->ss.sge.sge_length = resp_len;
+ epriv->ss.sge.length = epriv->ss.sge.sge_length;
+ /*
+ * We can safely zero these out. Since the first SGE covers the
+ * entire packet, nothing else should even look at the MR.
+ */
+ epriv->ss.sge.mr = NULL;
+ epriv->ss.sge.m = 0;
+ epriv->ss.sge.n = 0;
+
+ epriv->ss.sg_list = NULL;
+ epriv->ss.total_len = epriv->ss.sge.sge_length;
+ epriv->ss.num_sge = 1;
+
+ *ss = &epriv->ss;
+ *len = epriv->ss.total_len;
+
+ /* Construct the TID RDMA WRITE RESP packet header */
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+
+ KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth0, KVER, 0x1);
+ KDETH_RESET(ohdr->u.tid_rdma.w_rsp.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.w_rsp.aeth = rvt_compute_aeth(qp);
+ ohdr->u.tid_rdma.w_rsp.tid_flow_psn =
+ cpu_to_be32((flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT) |
+ (flow->flow_state.spsn &
+ HFI1_KDETH_BTH_SEQ_MASK));
+ ohdr->u.tid_rdma.w_rsp.tid_flow_qp =
+ cpu_to_be32(qpriv->tid_rdma.local.qp |
+ ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+ TID_RDMA_DESTQP_FLOW_SHIFT) |
+ qpriv->rcd->ctxt);
+ ohdr->u.tid_rdma.w_rsp.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 = remote->qp;
+ rcu_read_unlock();
+ hdwords = sizeof(ohdr->u.tid_rdma.w_rsp) / sizeof(u32);
+ qpriv->pending_tid_w_segs++;
+done:
+ return hdwords;
+}
+
+static void hfi1_add_tid_reap_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ if (!(qpriv->s_flags & HFI1_R_TID_RSC_TIMER)) {
+ qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
+ qpriv->s_tid_timer.expires = jiffies +
+ qpriv->tid_timer_timeout_jiffies;
+ add_timer(&qpriv->s_tid_timer);
+ }
+}
+
+static void hfi1_mod_tid_reap_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ lockdep_assert_held(&qp->s_lock);
+ qpriv->s_flags |= HFI1_R_TID_RSC_TIMER;
+ mod_timer(&qpriv->s_tid_timer, jiffies +
+ qpriv->tid_timer_timeout_jiffies);
+}
+
+static int hfi1_stop_tid_reap_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ int rval = 0;
+
+ lockdep_assert_held(&qp->s_lock);
+ if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
+ rval = del_timer(&qpriv->s_tid_timer);
+ qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
+ }
+ return rval;
+}
+
+void hfi1_del_tid_reap_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+
+ del_timer_sync(&qpriv->s_tid_timer);
+ qpriv->s_flags &= ~HFI1_R_TID_RSC_TIMER;
+}
+
+static void hfi1_tid_timeout(struct timer_list *t)
+{
+ struct hfi1_qp_priv *qpriv = from_timer(qpriv, t, s_tid_timer);
+ struct rvt_qp *qp = qpriv->owner;
+ struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device);
+ unsigned long flags;
+ u32 i;
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+ spin_lock(&qp->s_lock);
+ if (qpriv->s_flags & HFI1_R_TID_RSC_TIMER) {
+ dd_dev_warn(dd_from_ibdev(qp->ibqp.device), "[QP%u] %s %d\n",
+ qp->ibqp.qp_num, __func__, __LINE__);
+ trace_hfi1_msg_tid_timeout(/* msg */
+ qp, "resource timeout = ",
+ (u64)qpriv->tid_timer_timeout_jiffies);
+ hfi1_stop_tid_reap_timer(qp);
+ /*
+ * Go though the entire ack queue and clear any outstanding
+ * HW flow and RcvArray resources.
+ */
+ hfi1_kern_clear_hw_flow(qpriv->rcd, qp);
+ for (i = 0; i < rvt_max_atomic(rdi); i++) {
+ struct tid_rdma_request *req =
+ ack_to_tid_req(&qp->s_ack_queue[i]);
+
+ hfi1_kern_exp_rcv_clear_all(req);
+ }
+ spin_unlock(&qp->s_lock);
+ if (qp->ibqp.event_handler) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_FATAL;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ rvt_rc_error(qp, IB_WC_RESP_TIMEOUT_ERR);
+ goto unlock_r_lock;
+ }
+ spin_unlock(&qp->s_lock);
+unlock_r_lock:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
+{
+ /* HANDLER FOR TID RDMA WRITE RESPONSE packet (Requestor side */
+
+ /*
+ * 1. Find matching SWQE
+ * 2. Check that TIDENTRY array has enough space for a complete
+ * segment. If not, put QP in error state.
+ * 3. Save response data in struct tid_rdma_req and struct tid_rdma_flow
+ * 4. Remove HFI1_S_WAIT_TID_RESP from s_flags.
+ * 5. Set qp->s_state
+ * 6. Kick the send engine (hfi1_schedule_send())
+ */
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ enum ib_wc_status status;
+ u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen;
+ bool fecn;
+ unsigned long flags;
+
+ fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth);
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Ignore invalid responses */
+ if (cmp_psn(psn, qp->s_next_psn) >= 0)
+ goto ack_done;
+
+ /* Ignore duplicate responses. */
+ if (unlikely(cmp_psn(psn, qp->s_last_psn) <= 0))
+ goto ack_done;
+
+ if (unlikely(qp->s_acked == qp->s_tail))
+ goto ack_done;
+
+ /*
+ * If we are waiting for a particular packet sequence number
+ * due to a request being resent, check for it. Otherwise,
+ * ensure that we haven't missed anything.
+ */
+ if (qp->r_flags & RVT_R_RDMAR_SEQ) {
+ if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
+ goto ack_done;
+ qp->r_flags &= ~RVT_R_RDMAR_SEQ;
+ }
+
+ wqe = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);
+ if (unlikely(wqe->wr.opcode != IB_WR_TID_RDMA_WRITE))
+ goto ack_op_err;
+
+ req = wqe_to_tid_req(wqe);
+ /*
+ * If we've lost ACKs and our acked_tail pointer is too far
+ * behind, don't overwrite segments. Just drop the packet and
+ * let the reliability protocol take care of it.
+ */
+ if (!CIRC_SPACE(req->setup_head, req->acked_tail, MAX_FLOWS))
+ goto ack_done;
+
+ /*
+ * The call to do_rc_ack() should be last in the chain of
+ * packet checks because it will end up updating the QP state.
+ * Therefore, anything that would prevent the packet from
+ * being accepted as a successful response should be prior
+ * to it.
+ */
+ if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
+ goto ack_done;
+
+ trace_hfi1_ack(qp, psn);
+
+ flow = &req->flows[req->setup_head];
+ flow->pkt = 0;
+ flow->tid_idx = 0;
+ flow->tid_offset = 0;
+ flow->sent = 0;
+ flow->resync_npkts = 0;
+ flow->tid_qpn = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_qp);
+ flow->idx = (flow->tid_qpn >> TID_RDMA_DESTQP_FLOW_SHIFT) &
+ TID_RDMA_DESTQP_FLOW_MASK;
+ flow_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.w_rsp.tid_flow_psn));
+ flow->flow_state.generation = flow_psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ flow->flow_state.spsn = flow_psn & HFI1_KDETH_BTH_SEQ_MASK;
+ flow->flow_state.resp_ib_psn = psn;
+ flow->length = min_t(u32, req->seg_len,
+ (wqe->length - (req->comp_seg * req->seg_len)));
+
+ flow->npkts = rvt_div_round_up_mtu(qp, flow->length);
+ flow->flow_state.lpsn = flow->flow_state.spsn +
+ flow->npkts - 1;
+ /* payload length = packet length - (header length + ICRC length) */
+ pktlen = packet->tlen - (packet->hlen + 4);
+ if (pktlen > sizeof(flow->tid_entry)) {
+ status = IB_WC_LOC_LEN_ERR;
+ goto ack_err;
+ }
+ memcpy(flow->tid_entry, packet->ebuf, pktlen);
+ flow->tidcnt = pktlen / sizeof(*flow->tid_entry);
+ trace_hfi1_tid_flow_rcv_write_resp(qp, req->setup_head, flow);
+
+ req->comp_seg++;
+ trace_hfi1_tid_write_sender_rcv_resp(qp, 0);
+ /*
+ * Walk the TID_ENTRY list to make sure we have enough space for a
+ * complete segment.
+ */
+ for (i = 0; i < flow->tidcnt; i++) {
+ trace_hfi1_tid_entry_rcv_write_resp(/* entry */
+ qp, i, flow->tid_entry[i]);
+ if (!EXP_TID_GET(flow->tid_entry[i], LEN)) {
+ status = IB_WC_LOC_LEN_ERR;
+ goto ack_err;
+ }
+ tidlen += EXP_TID_GET(flow->tid_entry[i], LEN);
+ }
+ if (tidlen * PAGE_SIZE < flow->length) {
+ status = IB_WC_LOC_LEN_ERR;
+ goto ack_err;
+ }
+
+ trace_hfi1_tid_req_rcv_write_resp(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ /*
+ * If this is the first response for this request, set the initial
+ * flow index to the current flow.
+ */
+ if (!cmp_psn(psn, wqe->psn)) {
+ req->r_last_acked = mask_psn(wqe->psn - 1);
+ /* Set acked flow index to head index */
+ req->acked_tail = req->setup_head;
+ }
+
+ /* advance circular buffer head */
+ req->setup_head = CIRC_NEXT(req->setup_head, MAX_FLOWS);
+ req->state = TID_REQUEST_ACTIVE;
+
+ /*
+ * If all responses for this TID RDMA WRITE request have been received
+ * advance the pointer to the next one.
+ * Since TID RDMA requests could be mixed in with regular IB requests,
+ * they might not appear sequentially in the queue. Therefore, the
+ * next request needs to be "found".
+ */
+ if (qpriv->s_tid_cur != qpriv->s_tid_head &&
+ req->comp_seg == req->total_segs) {
+ for (i = qpriv->s_tid_cur + 1; ; i++) {
+ if (i == qp->s_size)
+ i = 0;
+ wqe = rvt_get_swqe_ptr(qp, i);
+ if (i == qpriv->s_tid_head)
+ break;
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+ break;
+ }
+ qpriv->s_tid_cur = i;
+ }
+ qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;
+ hfi1_schedule_tid_send(qp);
+ goto ack_done;
+
+ack_op_err:
+ status = IB_WC_LOC_QP_OP_ERR;
+ack_err:
+ rvt_error_qp(qp, status);
+ack_done:
+ if (fecn)
+ qp->s_flags |= RVT_S_ECN;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len)
+{
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = &req->flows[req->clear_tail];
+ struct tid_rdma_params *remote;
+ struct rvt_qp *qp = req->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ u32 tidentry = flow->tid_entry[flow->tid_idx];
+ u32 tidlen = EXP_TID_GET(tidentry, LEN) << PAGE_SHIFT;
+ struct tid_rdma_write_data *wd = &ohdr->u.tid_rdma.w_data;
+ u32 next_offset, om = KDETH_OM_LARGE;
+ bool last_pkt;
+
+ if (!tidlen) {
+ hfi1_trdma_send_complete(qp, wqe, IB_WC_REM_INV_RD_REQ_ERR);
+ rvt_error_qp(qp, IB_WC_REM_INV_RD_REQ_ERR);
+ }
+
+ *len = min_t(u32, qp->pmtu, tidlen - flow->tid_offset);
+ flow->sent += *len;
+ next_offset = flow->tid_offset + *len;
+ last_pkt = (flow->tid_idx == (flow->tidcnt - 1) &&
+ next_offset >= tidlen) || (flow->sent >= flow->length);
+ trace_hfi1_tid_entry_build_write_data(qp, flow->tid_idx, tidentry);
+ trace_hfi1_tid_flow_build_write_data(qp, req->clear_tail, flow);
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ KDETH_RESET(wd->kdeth0, KVER, 0x1);
+ KDETH_SET(wd->kdeth0, SH, !last_pkt);
+ KDETH_SET(wd->kdeth0, INTR, !!(!last_pkt && remote->urg));
+ KDETH_SET(wd->kdeth0, TIDCTRL, EXP_TID_GET(tidentry, CTRL));
+ KDETH_SET(wd->kdeth0, TID, EXP_TID_GET(tidentry, IDX));
+ KDETH_SET(wd->kdeth0, OM, om == KDETH_OM_LARGE);
+ KDETH_SET(wd->kdeth0, OFFSET, flow->tid_offset / om);
+ KDETH_RESET(wd->kdeth1, JKEY, remote->jkey);
+ wd->verbs_qp = cpu_to_be32(qp->remote_qpn);
+ rcu_read_unlock();
+
+ *bth1 = flow->tid_qpn;
+ *bth2 = mask_psn(((flow->flow_state.spsn + flow->pkt++) &
+ HFI1_KDETH_BTH_SEQ_MASK) |
+ (flow->flow_state.generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT));
+ if (last_pkt) {
+ /* PSNs are zero-based, so +1 to count number of packets */
+ if (flow->flow_state.lpsn + 1 +
+ rvt_div_round_up_mtu(qp, req->seg_len) >
+ MAX_TID_FLOW_PSN)
+ req->state = TID_REQUEST_SYNC;
+ *bth2 |= IB_BTH_REQ_ACK;
+ }
+
+ if (next_offset >= tidlen) {
+ flow->tid_offset = 0;
+ flow->tid_idx++;
+ } else {
+ flow->tid_offset = next_offset;
+ }
+ return last_pkt;
+}
+
+void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
+{
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ctxtdata *rcd = priv->rcd;
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ unsigned long flags;
+ u32 psn, next;
+ u8 opcode;
+ bool fecn;
+
+ fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
+
+ /*
+ * All error handling should be done by now. If we are here, the packet
+ * is either good or been accepted by the error handler.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+ e = &qp->s_ack_queue[priv->r_tid_tail];
+ req = ack_to_tid_req(e);
+ flow = &req->flows[req->clear_tail];
+ if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) {
+ update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
+
+ if (cmp_psn(psn, flow->flow_state.r_next_psn))
+ goto send_nak;
+
+ flow->flow_state.r_next_psn = mask_psn(psn + 1);
+ /*
+ * Copy the payload to destination buffer if this packet is
+ * delivered as an eager packet due to RSM rule and FECN.
+ * The RSM rule selects FECN bit in BTH and SH bit in
+ * KDETH header and therefore will not match the last
+ * packet of each segment that has SH bit cleared.
+ */
+ if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
+ struct rvt_sge_state ss;
+ u32 len;
+ u32 tlen = packet->tlen;
+ u16 hdrsize = packet->hlen;
+ u8 pad = packet->pad;
+ u8 extra_bytes = pad + packet->extra_byte +
+ (SIZE_OF_CRC << 2);
+ u32 pmtu = qp->pmtu;
+
+ if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
+ goto send_nak;
+ len = req->comp_seg * req->seg_len;
+ len += delta_psn(psn,
+ full_flow_psn(flow, flow->flow_state.spsn)) *
+ pmtu;
+ if (unlikely(req->total_len - len < pmtu))
+ goto send_nak;
+
+ /*
+ * The e->rdma_sge field is set when TID RDMA WRITE REQ
+ * is first received and is never modified thereafter.
+ */
+ ss.sge = e->rdma_sge;
+ ss.sg_list = NULL;
+ ss.num_sge = 1;
+ ss.total_len = req->total_len;
+ rvt_skip_sge(&ss, len, false);
+ rvt_copy_sge(qp, &ss, packet->payload, pmtu, false,
+ false);
+ /* Raise the sw sequence check flag for next packet */
+ priv->r_next_psn_kdeth = mask_psn(psn + 1);
+ priv->s_flags |= HFI1_R_TID_SW_PSN;
+ }
+ goto exit;
+ }
+ flow->flow_state.r_next_psn = mask_psn(psn + 1);
+ hfi1_kern_exp_rcv_clear(req);
+ priv->alloc_w_segs--;
+ rcd->flows[flow->idx].psn = psn & HFI1_KDETH_BTH_SEQ_MASK;
+ req->comp_seg++;
+ priv->s_nak_state = 0;
+
+ /*
+ * Release the flow if one of the following conditions has been met:
+ * - The request has reached a sync point AND all outstanding
+ * segments have been completed, or
+ * - The entire request is complete and there are no more requests
+ * (of any kind) in the queue.
+ */
+ trace_hfi1_rsp_rcv_tid_write_data(qp, psn);
+ trace_hfi1_tid_req_rcv_write_data(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ trace_hfi1_tid_write_rsp_rcv_data(qp);
+ validate_r_tid_ack(priv);
+
+ if (opcode == TID_OP(WRITE_DATA_LAST)) {
+ release_rdma_sge_mr(e);
+ for (next = priv->r_tid_tail + 1; ; next++) {
+ if (next > rvt_size_atomic(&dev->rdi))
+ next = 0;
+ if (next == priv->r_tid_head)
+ break;
+ e = &qp->s_ack_queue[next];
+ if (e->opcode == TID_OP(WRITE_REQ))
+ break;
+ }
+ priv->r_tid_tail = next;
+ if (++qp->s_acked_ack_queue > rvt_size_atomic(&dev->rdi))
+ qp->s_acked_ack_queue = 0;
+ }
+
+ hfi1_tid_write_alloc_resources(qp, true);
+
+ /*
+ * If we need to generate more responses, schedule the
+ * send engine.
+ */
+ if (req->cur_seg < req->total_segs ||
+ qp->s_tail_ack_queue != qp->r_head_ack_queue) {
+ qp->s_flags |= RVT_S_RESP_PENDING;
+ hfi1_schedule_send(qp);
+ }
+
+ priv->pending_tid_w_segs--;
+ if (priv->s_flags & HFI1_R_TID_RSC_TIMER) {
+ if (priv->pending_tid_w_segs)
+ hfi1_mod_tid_reap_timer(req->qp);
+ else
+ hfi1_stop_tid_reap_timer(req->qp);
+ }
+
+done:
+ tid_rdma_schedule_ack(qp);
+exit:
+ priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
+ if (fecn)
+ qp->s_flags |= RVT_S_ECN;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return;
+
+send_nak:
+ if (!priv->s_nak_state) {
+ priv->s_nak_state = IB_NAK_PSN_ERROR;
+ priv->s_nak_psn = flow->flow_state.r_next_psn;
+ tid_rdma_trigger_ack(qp);
+ }
+ goto done;
+}
+
+static bool hfi1_tid_rdma_is_resync_psn(u32 psn)
+{
+ return (bool)((psn & HFI1_KDETH_BTH_SEQ_MASK) ==
+ HFI1_KDETH_BTH_SEQ_MASK);
+}
+
+u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u16 iflow,
+ u32 *bth1, u32 *bth2)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_flow_state *fs = &qpriv->flow_state;
+ struct tid_rdma_request *req = ack_to_tid_req(e);
+ struct tid_rdma_flow *flow = &req->flows[iflow];
+ struct tid_rdma_params *remote;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 = remote->qp;
+ rcu_read_unlock();
+
+ if (qpriv->resync) {
+ *bth2 = mask_psn((fs->generation <<
+ HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
+ ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
+ } else if (qpriv->s_nak_state) {
+ *bth2 = mask_psn(qpriv->s_nak_psn);
+ ohdr->u.tid_rdma.ack.aeth =
+ cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
+ (qpriv->s_nak_state <<
+ IB_AETH_CREDIT_SHIFT));
+ } else {
+ *bth2 = full_flow_psn(flow, flow->flow_state.lpsn);
+ ohdr->u.tid_rdma.ack.aeth = rvt_compute_aeth(qp);
+ }
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
+ ohdr->u.tid_rdma.ack.tid_flow_qp =
+ cpu_to_be32(qpriv->tid_rdma.local.qp |
+ ((flow->idx & TID_RDMA_DESTQP_FLOW_MASK) <<
+ TID_RDMA_DESTQP_FLOW_SHIFT) |
+ qpriv->rcd->ctxt);
+
+ ohdr->u.tid_rdma.ack.tid_flow_psn = 0;
+ ohdr->u.tid_rdma.ack.verbs_psn =
+ cpu_to_be32(flow->flow_state.resp_ib_psn);
+
+ if (qpriv->resync) {
+ /*
+ * If the PSN before the current expect KDETH PSN is the
+ * RESYNC PSN, then we never received a good TID RDMA WRITE
+ * DATA packet after a previous RESYNC.
+ * In this case, the next expected KDETH PSN stays the same.
+ */
+ if (hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1)) {
+ ohdr->u.tid_rdma.ack.tid_flow_psn =
+ cpu_to_be32(qpriv->r_next_psn_kdeth_save);
+ } else {
+ /*
+ * Because the KDETH PSNs jump during a RESYNC, it's
+ * not possible to infer (or compute) the previous value
+ * of r_next_psn_kdeth in the case of back-to-back
+ * RESYNC packets. Therefore, we save it.
+ */
+ qpriv->r_next_psn_kdeth_save =
+ qpriv->r_next_psn_kdeth - 1;
+ ohdr->u.tid_rdma.ack.tid_flow_psn =
+ cpu_to_be32(qpriv->r_next_psn_kdeth_save);
+ qpriv->r_next_psn_kdeth = mask_psn(*bth2 + 1);
+ }
+ qpriv->resync = false;
+ }
+
+ return sizeof(ohdr->u.tid_rdma.ack) / sizeof(u32);
+}
+
+void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
+{
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct rvt_swqe *wqe;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ u32 aeth, psn, req_psn, ack_psn, flpsn, resync_psn, ack_kpsn;
+ unsigned long flags;
+ u16 fidx;
+
+ trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0);
+ process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth);
+ req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn));
+ resync_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.tid_flow_psn));
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ trace_hfi1_rcv_tid_ack(qp, aeth, psn, req_psn, resync_psn);
+
+ /* If we are waiting for an ACK to RESYNC, drop any other packets */
+ if ((qp->s_flags & HFI1_S_WAIT_HALT) &&
+ cmp_psn(psn, qpriv->s_resync_psn))
+ goto ack_op_err;
+
+ ack_psn = req_psn;
+ if (hfi1_tid_rdma_is_resync_psn(psn))
+ ack_kpsn = resync_psn;
+ else
+ ack_kpsn = psn;
+ if (aeth >> 29) {
+ ack_psn--;
+ ack_kpsn--;
+ }
+
+ if (unlikely(qp->s_acked == qp->s_tail))
+ goto ack_op_err;
+
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ goto ack_op_err;
+
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ flow = &req->flows[req->acked_tail];
+ trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
+
+ /* Drop stale ACK/NAK */
+ if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.spsn)) < 0 ||
+ cmp_psn(req_psn, flow->flow_state.resp_ib_psn) < 0)
+ goto ack_op_err;
+
+ while (cmp_psn(ack_kpsn,
+ full_flow_psn(flow, flow->flow_state.lpsn)) >= 0 &&
+ req->ack_seg < req->cur_seg) {
+ req->ack_seg++;
+ /* advance acked segment pointer */
+ req->acked_tail = CIRC_NEXT(req->acked_tail, MAX_FLOWS);
+ req->r_last_acked = flow->flow_state.resp_ib_psn;
+ trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ if (req->ack_seg == req->total_segs) {
+ req->state = TID_REQUEST_COMPLETE;
+ wqe = do_rc_completion(qp, wqe,
+ to_iport(qp->ibqp.device,
+ qp->port_num));
+ trace_hfi1_sender_rcv_tid_ack(qp);
+ atomic_dec(&qpriv->n_tid_requests);
+ if (qp->s_acked == qp->s_tail)
+ break;
+ if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
+ break;
+ req = wqe_to_tid_req(wqe);
+ }
+ flow = &req->flows[req->acked_tail];
+ trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail, flow);
+ }
+
+ trace_hfi1_tid_req_rcv_tid_ack(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ switch (aeth >> 29) {
+ case 0: /* ACK */
+ if (qpriv->s_flags & RVT_S_WAIT_ACK)
+ qpriv->s_flags &= ~RVT_S_WAIT_ACK;
+ if (!hfi1_tid_rdma_is_resync_psn(psn)) {
+ /* Check if there is any pending TID ACK */
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
+ req->ack_seg < req->cur_seg)
+ hfi1_mod_tid_retry_timer(qp);
+ else
+ hfi1_stop_tid_retry_timer(qp);
+ hfi1_schedule_send(qp);
+ } else {
+ u32 spsn, fpsn, last_acked, generation;
+ struct tid_rdma_request *rptr;
+
+ /* ACK(RESYNC) */
+ hfi1_stop_tid_retry_timer(qp);
+ /* Allow new requests (see hfi1_make_tid_rdma_pkt) */
+ qp->s_flags &= ~HFI1_S_WAIT_HALT;
+ /*
+ * Clear RVT_S_SEND_ONE flag in case that the TID RDMA
+ * ACK is received after the TID retry timer is fired
+ * again. In this case, do not send any more TID
+ * RESYNC request or wait for any more TID ACK packet.
+ */
+ qpriv->s_flags &= ~RVT_S_SEND_ONE;
+ hfi1_schedule_send(qp);
+
+ if ((qp->s_acked == qpriv->s_tid_tail &&
+ req->ack_seg == req->total_segs) ||
+ qp->s_acked == qp->s_tail) {
+ qpriv->s_state = TID_OP(WRITE_DATA_LAST);
+ goto done;
+ }
+
+ if (req->ack_seg == req->comp_seg) {
+ qpriv->s_state = TID_OP(WRITE_DATA);
+ goto done;
+ }
+
+ /*
+ * The PSN to start with is the next PSN after the
+ * RESYNC PSN.
+ */
+ psn = mask_psn(psn + 1);
+ generation = psn >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ spsn = 0;
+
+ /*
+ * Update to the correct WQE when we get an ACK(RESYNC)
+ * in the middle of a request.
+ */
+ if (delta_psn(ack_psn, wqe->lpsn))
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ req = wqe_to_tid_req(wqe);
+ flow = &req->flows[req->acked_tail];
+ /*
+ * RESYNC re-numbers the PSN ranges of all remaining
+ * segments. Also, PSN's start from 0 in the middle of a
+ * segment and the first segment size is less than the
+ * default number of packets. flow->resync_npkts is used
+ * to track the number of packets from the start of the
+ * real segment to the point of 0 PSN after the RESYNC
+ * in order to later correctly rewind the SGE.
+ */
+ fpsn = full_flow_psn(flow, flow->flow_state.spsn);
+ req->r_ack_psn = psn;
+ flow->resync_npkts +=
+ delta_psn(mask_psn(resync_psn + 1), fpsn);
+ /*
+ * Renumber all packet sequence number ranges
+ * based on the new generation.
+ */
+ last_acked = qp->s_acked;
+ rptr = req;
+ while (1) {
+ /* start from last acked segment */
+ for (fidx = rptr->acked_tail;
+ CIRC_CNT(rptr->setup_head, fidx,
+ MAX_FLOWS);
+ fidx = CIRC_NEXT(fidx, MAX_FLOWS)) {
+ u32 lpsn;
+ u32 gen;
+
+ flow = &rptr->flows[fidx];
+ gen = flow->flow_state.generation;
+ if (WARN_ON(gen == generation &&
+ flow->flow_state.spsn !=
+ spsn))
+ continue;
+ lpsn = flow->flow_state.lpsn;
+ lpsn = full_flow_psn(flow, lpsn);
+ flow->npkts =
+ delta_psn(lpsn,
+ mask_psn(resync_psn)
+ );
+ flow->flow_state.generation =
+ generation;
+ flow->flow_state.spsn = spsn;
+ flow->flow_state.lpsn =
+ flow->flow_state.spsn +
+ flow->npkts - 1;
+ flow->pkt = 0;
+ spsn += flow->npkts;
+ resync_psn += flow->npkts;
+ trace_hfi1_tid_flow_rcv_tid_ack(qp,
+ fidx,
+ flow);
+ }
+ if (++last_acked == qpriv->s_tid_cur + 1)
+ break;
+ if (last_acked == qp->s_size)
+ last_acked = 0;
+ wqe = rvt_get_swqe_ptr(qp, last_acked);
+ rptr = wqe_to_tid_req(wqe);
+ }
+ req->cur_seg = req->ack_seg;
+ qpriv->s_tid_tail = qp->s_acked;
+ qpriv->s_state = TID_OP(WRITE_REQ);
+ hfi1_schedule_tid_send(qp);
+ }
+done:
+ qpriv->s_retry = qp->s_retry_cnt;
+ break;
+
+ case 3: /* NAK */
+ hfi1_stop_tid_retry_timer(qp);
+ switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
+ IB_AETH_CREDIT_MASK) {
+ case 0: /* PSN sequence error */
+ if (!req->flows)
+ break;
+ flow = &req->flows[req->acked_tail];
+ flpsn = full_flow_psn(flow, flow->flow_state.lpsn);
+ if (cmp_psn(psn, flpsn) > 0)
+ break;
+ trace_hfi1_tid_flow_rcv_tid_ack(qp, req->acked_tail,
+ flow);
+ req->r_ack_psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+ req->cur_seg = req->ack_seg;
+ qpriv->s_tid_tail = qp->s_acked;
+ qpriv->s_state = TID_OP(WRITE_REQ);
+ qpriv->s_retry = qp->s_retry_cnt;
+ hfi1_schedule_tid_send(qp);
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ack_op_err:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+void hfi1_add_tid_retry_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+ lockdep_assert_held(&qp->s_lock);
+ if (!(priv->s_flags & HFI1_S_TID_RETRY_TIMER)) {
+ priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
+ priv->s_tid_retry_timer.expires = jiffies +
+ priv->tid_retry_timeout_jiffies + rdi->busy_jiffies;
+ add_timer(&priv->s_tid_retry_timer);
+ }
+}
+
+static void hfi1_mod_tid_retry_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+ lockdep_assert_held(&qp->s_lock);
+ priv->s_flags |= HFI1_S_TID_RETRY_TIMER;
+ mod_timer(&priv->s_tid_retry_timer, jiffies +
+ priv->tid_retry_timeout_jiffies + rdi->busy_jiffies);
+}
+
+static int hfi1_stop_tid_retry_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ int rval = 0;
+
+ lockdep_assert_held(&qp->s_lock);
+ if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
+ rval = del_timer(&priv->s_tid_retry_timer);
+ priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
+ }
+ return rval;
+}
+
+void hfi1_del_tid_retry_timer(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ del_timer_sync(&priv->s_tid_retry_timer);
+ priv->s_flags &= ~HFI1_S_TID_RETRY_TIMER;
+}
+
+static void hfi1_tid_retry_timeout(struct timer_list *t)
+{
+ struct hfi1_qp_priv *priv = from_timer(priv, t, s_tid_retry_timer);
+ struct rvt_qp *qp = priv->owner;
+ struct rvt_swqe *wqe;
+ unsigned long flags;
+ struct tid_rdma_request *req;
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+ spin_lock(&qp->s_lock);
+ trace_hfi1_tid_write_sender_retry_timeout(qp, 0);
+ if (priv->s_flags & HFI1_S_TID_RETRY_TIMER) {
+ hfi1_stop_tid_retry_timer(qp);
+ if (!priv->s_retry) {
+ trace_hfi1_msg_tid_retry_timeout(/* msg */
+ qp,
+ "Exhausted retries. Tid retry timeout = ",
+ (u64)priv->tid_retry_timeout_jiffies);
+
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ hfi1_trdma_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+ rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ } else {
+ wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_tid_retry_timeout(/* req */
+ qp, 0, wqe->wr.opcode, wqe->psn, wqe->lpsn, req);
+
+ priv->s_flags &= ~RVT_S_WAIT_ACK;
+ /* Only send one packet (the RESYNC) */
+ priv->s_flags |= RVT_S_SEND_ONE;
+ /*
+ * No additional request shall be made by this QP until
+ * the RESYNC has been complete.
+ */
+ qp->s_flags |= HFI1_S_WAIT_HALT;
+ priv->s_state = TID_OP(RESYNC);
+ priv->s_retry--;
+ hfi1_schedule_tid_send(qp);
+ }
+ }
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u16 fidx)
+{
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct tid_rdma_params *remote;
+ struct tid_rdma_request *req = wqe_to_tid_req(wqe);
+ struct tid_rdma_flow *flow = &req->flows[fidx];
+ u32 generation;
+
+ rcu_read_lock();
+ remote = rcu_dereference(qpriv->tid_rdma.remote);
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth1, JKEY, remote->jkey);
+ ohdr->u.tid_rdma.ack.verbs_qp = cpu_to_be32(qp->remote_qpn);
+ *bth1 = remote->qp;
+ rcu_read_unlock();
+
+ generation = kern_flow_generation_next(flow->flow_state.generation);
+ *bth2 = mask_psn((generation << HFI1_KDETH_BTH_SEQ_SHIFT) - 1);
+ qpriv->s_resync_psn = *bth2;
+ *bth2 |= IB_BTH_REQ_ACK;
+ KDETH_RESET(ohdr->u.tid_rdma.ack.kdeth0, KVER, 0x1);
+
+ return sizeof(ohdr->u.tid_rdma.resync) / sizeof(u32);
+}
+
+void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
+{
+ struct ib_other_headers *ohdr = packet->ohdr;
+ struct rvt_qp *qp = packet->qp;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ctxtdata *rcd = qpriv->rcd;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct rvt_ack_entry *e;
+ struct tid_rdma_request *req;
+ struct tid_rdma_flow *flow;
+ struct tid_flow_state *fs = &qpriv->flow_state;
+ u32 psn, generation, idx, gen_next;
+ bool fecn;
+ unsigned long flags;
+
+ fecn = process_ecn(qp, packet);
+ psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
+
+ generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT;
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ gen_next = (fs->generation == KERN_GENERATION_RESERVED) ?
+ generation : kern_flow_generation_next(fs->generation);
+ /*
+ * RESYNC packet contains the "next" generation and can only be
+ * from the current or previous generations
+ */
+ if (generation != mask_generation(gen_next - 1) &&
+ generation != gen_next)
+ goto bail;
+ /* Already processing a resync */
+ if (qpriv->resync)
+ goto bail;
+
+ spin_lock(&rcd->exp_lock);
+ if (fs->index >= RXE_NUM_TID_FLOWS) {
+ /*
+ * If we don't have a flow, save the generation so it can be
+ * applied when a new flow is allocated
+ */
+ fs->generation = generation;
+ } else {
+ /* Reprogram the QP flow with new generation */
+ rcd->flows[fs->index].generation = generation;
+ fs->generation = kern_setup_hw_flow(rcd, fs->index);
+ }
+ fs->psn = 0;
+ /*
+ * Disable SW PSN checking since a RESYNC is equivalent to a
+ * sync point and the flow has/will be reprogrammed
+ */
+ qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+ trace_hfi1_tid_write_rsp_rcv_resync(qp);
+
+ /*
+ * Reset all TID flow information with the new generation.
+ * This is done for all requests and segments after the
+ * last received segment
+ */
+ for (idx = qpriv->r_tid_tail; ; idx++) {
+ u16 flow_idx;
+
+ if (idx > rvt_size_atomic(&dev->rdi))
+ idx = 0;
+ e = &qp->s_ack_queue[idx];
+ if (e->opcode == TID_OP(WRITE_REQ)) {
+ req = ack_to_tid_req(e);
+ trace_hfi1_tid_req_rcv_resync(qp, 0, e->opcode, e->psn,
+ e->lpsn, req);
+
+ /* start from last unacked segment */
+ for (flow_idx = req->clear_tail;
+ CIRC_CNT(req->setup_head, flow_idx,
+ MAX_FLOWS);
+ flow_idx = CIRC_NEXT(flow_idx, MAX_FLOWS)) {
+ u32 lpsn;
+ u32 next;
+
+ flow = &req->flows[flow_idx];
+ lpsn = full_flow_psn(flow,
+ flow->flow_state.lpsn);
+ next = flow->flow_state.r_next_psn;
+ flow->npkts = delta_psn(lpsn, next - 1);
+ flow->flow_state.generation = fs->generation;
+ flow->flow_state.spsn = fs->psn;
+ flow->flow_state.lpsn =
+ flow->flow_state.spsn + flow->npkts - 1;
+ flow->flow_state.r_next_psn =
+ full_flow_psn(flow,
+ flow->flow_state.spsn);
+ fs->psn += flow->npkts;
+ trace_hfi1_tid_flow_rcv_resync(qp, flow_idx,
+ flow);
+ }
+ }
+ if (idx == qp->s_tail_ack_queue)
+ break;
+ }
+
+ spin_unlock(&rcd->exp_lock);
+ qpriv->resync = true;
+ /* RESYNC request always gets a TID RDMA ACK. */
+ qpriv->s_nak_state = 0;
+ tid_rdma_trigger_ack(qp);
+bail:
+ if (fecn)
+ qp->s_flags |= RVT_S_ECN;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/*
+ * Call this function when the last TID RDMA WRITE DATA packet for a request
+ * is built.
+ */
+static void update_tid_tail(struct rvt_qp *qp)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ u32 i;
+ struct rvt_swqe *wqe;
+
+ lockdep_assert_held(&qp->s_lock);
+ /* Can't move beyond s_tid_cur */
+ if (priv->s_tid_tail == priv->s_tid_cur)
+ return;
+ for (i = priv->s_tid_tail + 1; ; i++) {
+ if (i == qp->s_size)
+ i = 0;
+
+ if (i == priv->s_tid_cur)
+ break;
+ wqe = rvt_get_swqe_ptr(qp, i);
+ if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
+ break;
+ }
+ priv->s_tid_tail = i;
+ priv->s_state = TID_OP(WRITE_RESP);
+}
+
+int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
+ __must_hold(&qp->s_lock)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct rvt_swqe *wqe;
+ u32 bth1 = 0, bth2 = 0, hwords = 5, len, middle = 0;
+ struct ib_other_headers *ohdr;
+ struct rvt_sge_state *ss = &qp->s_sge;
+ struct rvt_ack_entry *e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ struct tid_rdma_request *req = ack_to_tid_req(e);
+ bool last = false;
+ u8 opcode = TID_OP(WRITE_DATA);
+
+ lockdep_assert_held(&qp->s_lock);
+ trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
+ /*
+ * Prioritize the sending of the requests and responses over the
+ * sending of the TID RDMA data packets.
+ */
+ if (((atomic_read(&priv->n_tid_requests) < HFI1_TID_RDMA_WRITE_CNT) &&
+ atomic_read(&priv->n_requests) &&
+ !(qp->s_flags & (RVT_S_BUSY | RVT_S_WAIT_ACK |
+ HFI1_S_ANY_WAIT_IO))) ||
+ (e->opcode == TID_OP(WRITE_REQ) && req->cur_seg < req->alloc_seg &&
+ !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)))) {
+ struct iowait_work *iowork;
+
+ iowork = iowait_get_ib_work(&priv->s_iowait);
+ ps->s_txreq = get_waiting_verbs_txreq(iowork);
+ if (ps->s_txreq || hfi1_make_rc_req(qp, ps)) {
+ priv->s_flags |= HFI1_S_TID_BUSY_SET;
+ return 1;
+ }
+ }
+
+ ps->s_txreq = get_txreq(ps->dev, qp);
+ if (!ps->s_txreq)
+ goto bail_no_tx;
+
+ ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
+
+ if ((priv->s_flags & RVT_S_ACK_PENDING) &&
+ make_tid_rdma_ack(qp, ohdr, ps))
+ return 1;
+
+ /*
+ * Bail out if we can't send data.
+ * Be reminded that this check must been done after the call to
+ * make_tid_rdma_ack() because the responding QP could be in
+ * RTR state where it can send TID RDMA ACK, not TID RDMA WRITE DATA.
+ */
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK))
+ goto bail;
+
+ if (priv->s_flags & RVT_S_WAIT_ACK)
+ goto bail;
+
+ /* Check whether there is anything to do. */
+ if (priv->s_tid_tail == HFI1_QP_WQE_INVALID)
+ goto bail;
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
+ req = wqe_to_tid_req(wqe);
+ trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode, wqe->psn,
+ wqe->lpsn, req);
+ switch (priv->s_state) {
+ case TID_OP(WRITE_REQ):
+ case TID_OP(WRITE_RESP):
+ priv->tid_ss.sge = wqe->sg_list[0];
+ priv->tid_ss.sg_list = wqe->sg_list + 1;
+ priv->tid_ss.num_sge = wqe->wr.num_sge;
+ priv->tid_ss.total_len = wqe->length;
+
+ if (priv->s_state == TID_OP(WRITE_REQ))
+ hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
+ priv->s_state = TID_OP(WRITE_DATA);
+ /* fall through */
+
+ case TID_OP(WRITE_DATA):
+ /*
+ * 1. Check whether TID RDMA WRITE RESP available.
+ * 2. If no:
+ * 2.1 If have more segments and no TID RDMA WRITE RESP,
+ * set HFI1_S_WAIT_TID_RESP
+ * 2.2 Return indicating no progress made.
+ * 3. If yes:
+ * 3.1 Build TID RDMA WRITE DATA packet.
+ * 3.2 If last packet in segment:
+ * 3.2.1 Change KDETH header bits
+ * 3.2.2 Advance RESP pointers.
+ * 3.3 Return indicating progress made.
+ */
+ trace_hfi1_sender_make_tid_pkt(qp);
+ trace_hfi1_tid_write_sender_make_tid_pkt(qp, 0);
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_tail);
+ req = wqe_to_tid_req(wqe);
+ len = wqe->length;
+
+ if (!req->comp_seg || req->cur_seg == req->comp_seg)
+ goto bail;
+
+ trace_hfi1_tid_req_make_tid_pkt(qp, 0, wqe->wr.opcode,
+ wqe->psn, wqe->lpsn, req);
+ last = hfi1_build_tid_rdma_packet(wqe, ohdr, &bth1, &bth2,
+ &len);
+
+ if (last) {
+ /* move pointer to next flow */
+ req->clear_tail = CIRC_NEXT(req->clear_tail,
+ MAX_FLOWS);
+ if (++req->cur_seg < req->total_segs) {
+ if (!CIRC_CNT(req->setup_head, req->clear_tail,
+ MAX_FLOWS))
+ qp->s_flags |= HFI1_S_WAIT_TID_RESP;
+ } else {
+ priv->s_state = TID_OP(WRITE_DATA_LAST);
+ opcode = TID_OP(WRITE_DATA_LAST);
+
+ /* Advance the s_tid_tail now */
+ update_tid_tail(qp);
+ }
+ }
+ hwords += sizeof(ohdr->u.tid_rdma.w_data) / sizeof(u32);
+ ss = &priv->tid_ss;
+ break;
+
+ case TID_OP(RESYNC):
+ trace_hfi1_sender_make_tid_pkt(qp);
+ /* Use generation from the most recently received response */
+ wqe = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
+ req = wqe_to_tid_req(wqe);
+ /* If no responses for this WQE look at the previous one */
+ if (!req->comp_seg) {
+ wqe = rvt_get_swqe_ptr(qp,
+ (!priv->s_tid_cur ? qp->s_size :
+ priv->s_tid_cur) - 1);
+ req = wqe_to_tid_req(wqe);
+ }
+ hwords += hfi1_build_tid_rdma_resync(qp, wqe, ohdr, &bth1,
+ &bth2,
+ CIRC_PREV(req->setup_head,
+ MAX_FLOWS));
+ ss = NULL;
+ len = 0;
+ opcode = TID_OP(RESYNC);
+ break;
+
+ default:
+ goto bail;
+ }
+ if (priv->s_flags & RVT_S_SEND_ONE) {
+ priv->s_flags &= ~RVT_S_SEND_ONE;
+ priv->s_flags |= RVT_S_WAIT_ACK;
+ bth2 |= IB_BTH_REQ_ACK;
+ }
+ qp->s_len -= len;
+ ps->s_txreq->hdr_dwords = hwords;
+ ps->s_txreq->sde = priv->s_sde;
+ ps->s_txreq->ss = ss;
+ ps->s_txreq->s_cur_size = len;
+ hfi1_make_ruc_header(qp, ohdr, (opcode << 24), bth1, bth2,
+ middle, ps);
+ return 1;
+bail:
+ hfi1_put_txreq(ps->s_txreq);
+bail_no_tx:
+ ps->s_txreq = NULL;
+ priv->s_flags &= ~RVT_S_BUSY;
+ /*
+ * If we didn't get a txreq, the QP will be woken up later to try
+ * again, set the flags to the the wake up which work item to wake
+ * up.
+ * (A better algorithm should be found to do this and generalize the
+ * sleep/wakeup flags.)
+ */
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ return 0;
+}
+
+static int make_tid_rdma_ack(struct rvt_qp *qp,
+ struct ib_other_headers *ohdr,
+ struct hfi1_pkt_state *ps)
+{
+ struct rvt_ack_entry *e;
+ struct hfi1_qp_priv *qpriv = qp->priv;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ u32 hwords, next;
+ u32 len = 0;
+ u32 bth1 = 0, bth2 = 0;
+ int middle = 0;
+ u16 flow;
+ struct tid_rdma_request *req, *nreq;
+
+ trace_hfi1_tid_write_rsp_make_tid_ack(qp);
+ /* Don't send an ACK if we aren't supposed to. */
+ if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
+ goto bail;
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ /*
+ * In the RESYNC case, we are exactly one segment past the
+ * previously sent ack or at the previously sent NAK. So to send
+ * the resync ack, we go back one segment (which might be part of
+ * the previous request) and let the do-while loop execute again.
+ * The advantage of executing the do-while loop is that any data
+ * received after the previous ack is automatically acked in the
+ * RESYNC ack. It turns out that for the do-while loop we only need
+ * to pull back qpriv->r_tid_ack, not the segment
+ * indices/counters. The scheme works even if the previous request
+ * was not a TID WRITE request.
+ */
+ if (qpriv->resync) {
+ if (!req->ack_seg || req->ack_seg == req->total_segs)
+ qpriv->r_tid_ack = !qpriv->r_tid_ack ?
+ rvt_size_atomic(&dev->rdi) :
+ qpriv->r_tid_ack - 1;
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ }
+
+ trace_hfi1_rsp_make_tid_ack(qp, e->psn);
+ trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ /*
+ * If we've sent all the ACKs that we can, we are done
+ * until we get more segments...
+ */
+ if (!qpriv->s_nak_state && !qpriv->resync &&
+ req->ack_seg == req->comp_seg)
+ goto bail;
+
+ do {
+ /*
+ * To deal with coalesced ACKs, the acked_tail pointer
+ * into the flow array is used. The distance between it
+ * and the clear_tail is the number of flows that are
+ * being ACK'ed.
+ */
+ req->ack_seg +=
+ /* Get up-to-date value */
+ CIRC_CNT(req->clear_tail, req->acked_tail,
+ MAX_FLOWS);
+ /* Advance acked index */
+ req->acked_tail = req->clear_tail;
+
+ /*
+ * req->clear_tail points to the segment currently being
+ * received. So, when sending an ACK, the previous
+ * segment is being ACK'ed.
+ */
+ flow = CIRC_PREV(req->acked_tail, MAX_FLOWS);
+ if (req->ack_seg != req->total_segs)
+ break;
+ req->state = TID_REQUEST_COMPLETE;
+
+ next = qpriv->r_tid_ack + 1;
+ if (next > rvt_size_atomic(&dev->rdi))
+ next = 0;
+ qpriv->r_tid_ack = next;
+ if (qp->s_ack_queue[next].opcode != TID_OP(WRITE_REQ))
+ break;
+ nreq = ack_to_tid_req(&qp->s_ack_queue[next]);
+ if (!nreq->comp_seg || nreq->ack_seg == nreq->comp_seg)
+ break;
+
+ /* Move to the next ack entry now */
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ } while (1);
+
+ /*
+ * At this point qpriv->r_tid_ack == qpriv->r_tid_tail but e and
+ * req could be pointing at the previous ack queue entry
+ */
+ if (qpriv->s_nak_state ||
+ (qpriv->resync &&
+ !hfi1_tid_rdma_is_resync_psn(qpriv->r_next_psn_kdeth - 1) &&
+ (cmp_psn(qpriv->r_next_psn_kdeth - 1,
+ full_flow_psn(&req->flows[flow],
+ req->flows[flow].flow_state.lpsn)) > 0))) {
+ /*
+ * A NAK will implicitly acknowledge all previous TID RDMA
+ * requests. Therefore, we NAK with the req->acked_tail
+ * segment for the request at qpriv->r_tid_ack (same at
+ * this point as the req->clear_tail segment for the
+ * qpriv->r_tid_tail request)
+ */
+ e = &qp->s_ack_queue[qpriv->r_tid_ack];
+ req = ack_to_tid_req(e);
+ flow = req->acked_tail;
+ } else if (req->ack_seg == req->total_segs &&
+ qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK)
+ qpriv->s_flags &= ~HFI1_R_TID_WAIT_INTERLCK;
+
+ trace_hfi1_tid_write_rsp_make_tid_ack(qp);
+ trace_hfi1_tid_req_make_tid_ack(qp, 0, e->opcode, e->psn, e->lpsn,
+ req);
+ hwords += hfi1_build_tid_rdma_write_ack(qp, e, ohdr, flow, &bth1,
+ &bth2);
+ len = 0;
+ qpriv->s_flags &= ~RVT_S_ACK_PENDING;
+ ps->s_txreq->hdr_dwords = hwords;
+ ps->s_txreq->sde = qpriv->s_sde;
+ ps->s_txreq->s_cur_size = len;
+ ps->s_txreq->ss = NULL;
+ hfi1_make_ruc_header(qp, ohdr, (TID_OP(ACK) << 24), bth1, bth2, middle,
+ ps);
+ ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
+ return 1;
+bail:
+ /*
+ * Ensure s_rdma_ack_cnt changes are committed prior to resetting
+ * RVT_S_RESP_PENDING
+ */
+ smp_wmb();
+ qpriv->s_flags &= ~RVT_S_ACK_PENDING;
+ return 0;
+}
+
+static int hfi1_send_tid_ok(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ return !(priv->s_flags & RVT_S_BUSY ||
+ qp->s_flags & HFI1_S_ANY_WAIT_IO) &&
+ (verbs_txreq_queued(iowait_get_tid_work(&priv->s_iowait)) ||
+ (priv->s_flags & RVT_S_RESP_PENDING) ||
+ !(qp->s_flags & HFI1_S_ANY_TID_WAIT_SEND));
+}
+
+void _hfi1_do_tid_send(struct work_struct *work)
+{
+ struct iowait_work *w = container_of(work, struct iowait_work, iowork);
+ struct rvt_qp *qp = iowait_to_qp(w->iow);
+
+ hfi1_do_tid_send(qp);
+}
+
+static void hfi1_do_tid_send(struct rvt_qp *qp)
+{
+ struct hfi1_pkt_state ps;
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ ps.dev = to_idev(qp->ibqp.device);
+ ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ps.ppd = ppd_from_ibp(ps.ibp);
+ ps.wait = iowait_get_tid_work(&priv->s_iowait);
+ ps.in_thread = false;
+ ps.timeout_int = qp->timeout_jiffies / 8;
+
+ trace_hfi1_rc_do_tid_send(qp, false);
+ spin_lock_irqsave(&qp->s_lock, ps.flags);
+
+ /* Return if we are already busy processing a work request. */
+ if (!hfi1_send_tid_ok(qp)) {
+ if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+ iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_TID);
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+ return;
+ }
+
+ priv->s_flags |= RVT_S_BUSY;
+
+ ps.timeout = jiffies + ps.timeout_int;
+ ps.cpu = priv->s_sde ? priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(ps.ppd->dd->node));
+ ps.pkts_sent = false;
+
+ /* insure a pre-built packet is handled */
+ ps.s_txreq = get_waiting_verbs_txreq(ps.wait);
+ do {
+ /* Check for a constructed packet to be sent. */
+ if (ps.s_txreq) {
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ qp->s_flags |= RVT_S_BUSY;
+ ps.wait = iowait_get_ib_work(&priv->s_iowait);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+
+ /*
+ * If the packet cannot be sent now, return and
+ * the send tasklet will be woken up later.
+ */
+ if (hfi1_verbs_send(qp, &ps))
+ return;
+
+ /* allow other tasks to run */
+ if (hfi1_schedule_send_yield(qp, &ps, true))
+ return;
+
+ spin_lock_irqsave(&qp->s_lock, ps.flags);
+ if (priv->s_flags & HFI1_S_TID_BUSY_SET) {
+ qp->s_flags &= ~RVT_S_BUSY;
+ priv->s_flags &= ~HFI1_S_TID_BUSY_SET;
+ ps.wait = iowait_get_tid_work(&priv->s_iowait);
+ if (iowait_flag_set(&priv->s_iowait,
+ IOWAIT_PENDING_IB))
+ hfi1_schedule_send(qp);
+ }
+ }
+ } while (hfi1_make_tid_rdma_pkt(qp, &ps));
+ iowait_starve_clear(ps.pkts_sent, &priv->s_iowait);
+ spin_unlock_irqrestore(&qp->s_lock, ps.flags);
+}
+
+static bool _hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct hfi1_ibport *ibp =
+ to_iport(qp->ibqp.device, qp->port_num);
+ struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+
+ return iowait_tid_schedule(&priv->s_iowait, ppd->hfi1_wq,
+ priv->s_sde ?
+ priv->s_sde->cpu :
+ cpumask_first(cpumask_of_node(dd->node)));
+}
+
+/**
+ * hfi1_schedule_tid_send - schedule progress on TID RDMA state machine
+ * @qp: the QP
+ *
+ * This schedules qp progress on the TID RDMA state machine. Caller
+ * should hold the s_lock.
+ * Unlike hfi1_schedule_send(), this cannot use hfi1_send_ok() because
+ * the two state machines can step on each other with respect to the
+ * RVT_S_BUSY flag.
+ * Therefore, a modified test is used.
+ * @return true if the second leg is scheduled;
+ * false if the second leg is not scheduled.
+ */
+bool hfi1_schedule_tid_send(struct rvt_qp *qp)
+{
+ lockdep_assert_held(&qp->s_lock);
+ if (hfi1_send_tid_ok(qp)) {
+ /*
+ * The following call returns true if the qp is not on the
+ * queue and false if the qp is already on the queue before
+ * this call. Either way, the qp will be on the queue when the
+ * call returns.
+ */
+ _hfi1_schedule_tid_send(qp);
+ return true;
+ }
+ if (qp->s_flags & HFI1_S_ANY_WAIT_IO)
+ iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait,
+ IOWAIT_PENDING_TID);
+ return false;
+}
+
+bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e)
+{
+ struct rvt_ack_entry *prev;
+ struct tid_rdma_request *req;
+ struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
+ struct hfi1_qp_priv *priv = qp->priv;
+ u32 s_prev;
+
+ s_prev = qp->s_tail_ack_queue == 0 ? rvt_size_atomic(&dev->rdi) :
+ (qp->s_tail_ack_queue - 1);
+ prev = &qp->s_ack_queue[s_prev];
+
+ if ((e->opcode == TID_OP(READ_REQ) ||
+ e->opcode == OP(RDMA_READ_REQUEST)) &&
+ prev->opcode == TID_OP(WRITE_REQ)) {
+ req = ack_to_tid_req(prev);
+ if (req->ack_seg != req->total_segs) {
+ priv->s_flags |= HFI1_R_TID_WAIT_INTERLCK;
+ return true;
+ }
+ }
+ return false;
+}
+
+static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx)
+{
+ u64 reg;
+
+ /*
+ * The only sane way to get the amount of
+ * progress is to read the HW flow state.
+ */
+ reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx));
+ return mask_psn(reg);
+}
+
+static void tid_rdma_rcv_err(struct hfi1_packet *packet,
+ struct ib_other_headers *ohdr,
+ struct rvt_qp *qp, u32 psn, int diff, bool fecn)
+{
+ unsigned long flags;
+
+ tid_rdma_rcv_error(packet, ohdr, qp, psn, diff);
+ if (fecn) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ qp->s_flags |= RVT_S_ECN;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+}
+
+static void update_r_next_psn_fecn(struct hfi1_packet *packet,
+ struct hfi1_qp_priv *priv,
+ struct hfi1_ctxtdata *rcd,
+ struct tid_rdma_flow *flow,
+ bool fecn)
+{
+ /*
+ * If a start/middle packet is delivered here due to
+ * RSM rule and FECN, we need to update the r_next_psn.
+ */
+ if (fecn && packet->etype == RHF_RCV_TYPE_EAGER &&
+ !(priv->s_flags & HFI1_R_TID_SW_PSN)) {
+ struct hfi1_devdata *dd = rcd->dd;
+
+ flow->flow_state.r_next_psn =
+ read_r_next_psn(dd, rcd->ctxt, flow->idx);
+ }
+}
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.h b/drivers/infiniband/hw/hfi1/tid_rdma.h
new file mode 100644
index 0000000..6e82df2
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.h
@@ -0,0 +1,319 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#ifndef HFI1_TID_RDMA_H
+#define HFI1_TID_RDMA_H
+
+#include <linux/circ_buf.h>
+#include "common.h"
+
+/* Add a convenience helper */
+#define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1))
+#define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size)
+#define CIRC_PREV(val, size) CIRC_ADD(val, -1, size)
+
+#define TID_RDMA_MIN_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */
+#define TID_RDMA_MAX_SEGMENT_SIZE BIT(18) /* 256 KiB (for now) */
+#define TID_RDMA_MAX_PAGES (BIT(18) >> PAGE_SHIFT)
+#define TID_RDMA_SEGMENT_SHIFT 18
+
+/*
+ * Bit definitions for priv->s_flags.
+ * These bit flags overload the bit flags defined for the QP's s_flags.
+ * Due to the fact that these bit fields are used only for the QP priv
+ * s_flags, there are no collisions.
+ *
+ * HFI1_S_TID_WAIT_INTERLCK - QP is waiting for requester interlock
+ * HFI1_R_TID_WAIT_INTERLCK - QP is waiting for responder interlock
+ */
+#define HFI1_S_TID_BUSY_SET BIT(0)
+/* BIT(1) reserved for RVT_S_BUSY. */
+#define HFI1_R_TID_RSC_TIMER BIT(2)
+/* BIT(3) reserved for RVT_S_RESP_PENDING. */
+/* BIT(4) reserved for RVT_S_ACK_PENDING. */
+#define HFI1_S_TID_WAIT_INTERLCK BIT(5)
+#define HFI1_R_TID_WAIT_INTERLCK BIT(6)
+/* BIT(7) - BIT(15) reserved for RVT_S_WAIT_*. */
+/* BIT(16) reserved for RVT_S_SEND_ONE */
+#define HFI1_S_TID_RETRY_TIMER BIT(17)
+/* BIT(18) reserved for RVT_S_ECN. */
+#define HFI1_R_TID_SW_PSN BIT(19)
+/* BIT(26) reserved for HFI1_S_WAIT_HALT */
+/* BIT(27) reserved for HFI1_S_WAIT_TID_RESP */
+/* BIT(28) reserved for HFI1_S_WAIT_TID_SPACE */
+
+/*
+ * Unlike regular IB RDMA VERBS, which do not require an entry
+ * in the s_ack_queue, TID RDMA WRITE requests do because they
+ * generate responses.
+ * Therefore, the s_ack_queue needs to be extended by a certain
+ * amount. The key point is that the queue needs to be extended
+ * without letting the "user" know so they user doesn't end up
+ * using these extra entries.
+ */
+#define HFI1_TID_RDMA_WRITE_CNT 8
+
+struct tid_rdma_params {
+ struct rcu_head rcu_head;
+ u32 qp;
+ u32 max_len;
+ u16 jkey;
+ u8 max_read;
+ u8 max_write;
+ u8 timeout;
+ u8 urg;
+ u8 version;
+};
+
+struct tid_rdma_qp_params {
+ struct work_struct trigger_work;
+ struct tid_rdma_params local;
+ struct tid_rdma_params __rcu *remote;
+};
+
+/* Track state for each hardware flow */
+struct tid_flow_state {
+ u32 generation;
+ u32 psn;
+ u8 index;
+ u8 last_index;
+};
+
+enum tid_rdma_req_state {
+ TID_REQUEST_INACTIVE = 0,
+ TID_REQUEST_INIT,
+ TID_REQUEST_INIT_RESEND,
+ TID_REQUEST_ACTIVE,
+ TID_REQUEST_RESEND,
+ TID_REQUEST_RESEND_ACTIVE,
+ TID_REQUEST_QUEUED,
+ TID_REQUEST_SYNC,
+ TID_REQUEST_RNR_NAK,
+ TID_REQUEST_COMPLETE,
+};
+
+struct tid_rdma_request {
+ struct rvt_qp *qp;
+ struct hfi1_ctxtdata *rcd;
+ union {
+ struct rvt_swqe *swqe;
+ struct rvt_ack_entry *ack;
+ } e;
+
+ struct tid_rdma_flow *flows; /* array of tid flows */
+ struct rvt_sge_state ss; /* SGE state for TID RDMA requests */
+ u16 n_flows; /* size of the flow buffer window */
+ u16 setup_head; /* flow index we are setting up */
+ u16 clear_tail; /* flow index we are clearing */
+ u16 flow_idx; /* flow index most recently set up */
+ u16 acked_tail;
+
+ u32 seg_len;
+ u32 total_len;
+ u32 r_ack_psn; /* next expected ack PSN */
+ u32 r_flow_psn; /* IB PSN of next segment start */
+ u32 r_last_acked; /* IB PSN of last ACK'ed packet */
+ u32 s_next_psn; /* IB PSN of next segment start for read */
+
+ u32 total_segs; /* segments required to complete a request */
+ u32 cur_seg; /* index of current segment */
+ u32 comp_seg; /* index of last completed segment */
+ u32 ack_seg; /* index of last ack'ed segment */
+ u32 alloc_seg; /* index of next segment to be allocated */
+ u32 isge; /* index of "current" sge */
+ u32 ack_pending; /* num acks pending for this request */
+
+ enum tid_rdma_req_state state;
+};
+
+/*
+ * When header suppression is used, PSNs associated with a "flow" are
+ * relevant (and not the PSNs maintained by verbs). Track per-flow
+ * PSNs here for a TID RDMA segment.
+ *
+ */
+struct flow_state {
+ u32 flags;
+ u32 resp_ib_psn; /* The IB PSN of the response for this flow */
+ u32 generation; /* generation of flow */
+ u32 spsn; /* starting PSN in TID space */
+ u32 lpsn; /* last PSN in TID space */
+ u32 r_next_psn; /* next PSN to be received (in TID space) */
+
+ /* For tid rdma read */
+ u32 ib_spsn; /* starting PSN in Verbs space */
+ u32 ib_lpsn; /* last PSn in Verbs space */
+};
+
+struct tid_rdma_pageset {
+ dma_addr_t addr : 48; /* Only needed for the first page */
+ u8 idx: 8;
+ u8 count : 7;
+ u8 mapped: 1;
+};
+
+/**
+ * kern_tid_node - used for managing TID's in TID groups
+ *
+ * @grp_idx: rcd relative index to tid_group
+ * @map: grp->map captured prior to programming this TID group in HW
+ * @cnt: Only @cnt of available group entries are actually programmed
+ */
+struct kern_tid_node {
+ struct tid_group *grp;
+ u8 map;
+ u8 cnt;
+};
+
+/* Overall info for a TID RDMA segment */
+struct tid_rdma_flow {
+ /*
+ * While a TID RDMA segment is being transferred, it uses a QP number
+ * from the "KDETH section of QP numbers" (which is different from the
+ * QP number that originated the request). Bits 11-15 of these QP
+ * numbers identify the "TID flow" for the segment.
+ */
+ struct flow_state flow_state;
+ struct tid_rdma_request *req;
+ u32 tid_qpn;
+ u32 tid_offset;
+ u32 length;
+ u32 sent;
+ u8 tnode_cnt;
+ u8 tidcnt;
+ u8 tid_idx;
+ u8 idx;
+ u8 npagesets;
+ u8 npkts;
+ u8 pkt;
+ u8 resync_npkts;
+ struct kern_tid_node tnode[TID_RDMA_MAX_PAGES];
+ struct tid_rdma_pageset pagesets[TID_RDMA_MAX_PAGES];
+ u32 tid_entry[TID_RDMA_MAX_PAGES];
+};
+
+enum tid_rnr_nak_state {
+ TID_RNR_NAK_INIT = 0,
+ TID_RNR_NAK_SEND,
+ TID_RNR_NAK_SENT,
+};
+
+bool tid_rdma_conn_req(struct rvt_qp *qp, u64 *data);
+bool tid_rdma_conn_reply(struct rvt_qp *qp, u64 data);
+bool tid_rdma_conn_resp(struct rvt_qp *qp, u64 *data);
+void tid_rdma_conn_error(struct rvt_qp *qp);
+void tid_rdma_opfn_init(struct rvt_qp *qp, struct tid_rdma_params *p);
+
+int hfi1_kern_exp_rcv_init(struct hfi1_ctxtdata *rcd, int reinit);
+int hfi1_kern_exp_rcv_setup(struct tid_rdma_request *req,
+ struct rvt_sge_state *ss, bool *last);
+int hfi1_kern_exp_rcv_clear(struct tid_rdma_request *req);
+void hfi1_kern_exp_rcv_clear_all(struct tid_rdma_request *req);
+void __trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+/**
+ * trdma_clean_swqe - clean flows for swqe if large send queue
+ * @qp: the qp
+ * @wqe: the send wqe
+ */
+static inline void trdma_clean_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+ if (!wqe->priv)
+ return;
+ __trdma_clean_swqe(qp, wqe);
+}
+
+void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp);
+
+int hfi1_qp_priv_init(struct rvt_dev_info *rdi, struct rvt_qp *qp,
+ struct ib_qp_init_attr *init_attr);
+void hfi1_qp_priv_tid_free(struct rvt_dev_info *rdi, struct rvt_qp *qp);
+
+void hfi1_tid_rdma_flush_wait(struct rvt_qp *qp);
+
+int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp);
+void hfi1_kern_clear_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp);
+void hfi1_kern_init_ctxt_generations(struct hfi1_ctxtdata *rcd);
+
+struct cntr_entry;
+u64 hfi1_access_sw_tid_wait(const struct cntr_entry *entry,
+ void *context, int vl, int mode, u64 data);
+
+u32 hfi1_build_tid_rdma_read_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len);
+u32 hfi1_build_tid_rdma_read_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u32 *len);
+void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet);
+u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth0,
+ u32 *bth1, u32 *bth2, u32 *len, bool *last);
+void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet);
+bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
+ struct hfi1_pportdata *ppd,
+ struct hfi1_packet *packet);
+void hfi1_tid_rdma_restart_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ u32 *bth2);
+void hfi1_qp_kern_exp_rcv_clear_all(struct rvt_qp *qp);
+bool hfi1_tid_rdma_wqe_interlock(struct rvt_qp *qp, struct rvt_swqe *wqe);
+
+void setup_tid_rdma_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+static inline void hfi1_setup_tid_rdma_wqe(struct rvt_qp *qp,
+ struct rvt_swqe *wqe)
+{
+ if (wqe->priv &&
+ (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_RDMA_WRITE) &&
+ wqe->length >= TID_RDMA_MIN_SEGMENT_SIZE)
+ setup_tid_rdma_wqe(qp, wqe);
+}
+
+u32 hfi1_build_tid_rdma_write_req(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len);
+
+void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet);
+
+u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 bth2, u32 *len,
+ struct rvt_sge_state **ss);
+
+void hfi1_del_tid_reap_timer(struct rvt_qp *qp);
+
+void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet);
+
+bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr,
+ u32 *bth1, u32 *bth2, u32 *len);
+
+void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet);
+
+u32 hfi1_build_tid_rdma_write_ack(struct rvt_qp *qp, struct rvt_ack_entry *e,
+ struct ib_other_headers *ohdr, u16 iflow,
+ u32 *bth1, u32 *bth2);
+
+void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet);
+
+void hfi1_add_tid_retry_timer(struct rvt_qp *qp);
+void hfi1_del_tid_retry_timer(struct rvt_qp *qp);
+
+u32 hfi1_build_tid_rdma_resync(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ struct ib_other_headers *ohdr, u32 *bth1,
+ u32 *bth2, u16 fidx);
+
+void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet);
+
+struct hfi1_pkt_state;
+int hfi1_make_tid_rdma_pkt(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
+
+void _hfi1_do_tid_send(struct work_struct *work);
+
+bool hfi1_schedule_tid_send(struct rvt_qp *qp);
+
+bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e);
+
+#endif /* HFI1_TID_RDMA_H */
diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c
index 7c8aed0..9a3d236 100644
--- a/drivers/infiniband/hw/hfi1/trace.c
+++ b/drivers/infiniband/hw/hfi1/trace.c
@@ -46,6 +46,7 @@
*/
#define CREATE_TRACE_POINTS
#include "trace.h"
+#include "exp_rcv.h"
static u8 __get_ib_hdr_len(struct ib_header *hdr)
{
@@ -128,6 +129,15 @@
#define IETH_PRN "ieth rkey:0x%.8x"
#define ATOMICACKETH_PRN "origdata:%llx"
#define ATOMICETH_PRN "vaddr:0x%llx rkey:0x%.8x sdata:%llx cdata:%llx"
+#define TID_RDMA_KDETH "kdeth0 0x%x kdeth1 0x%x"
+#define TID_RDMA_KDETH_DATA "kdeth0 0x%x: kver %u sh %u intr %u tidctrl %u tid %x offset %x kdeth1 0x%x: jkey %x"
+#define TID_READ_REQ_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_READ_RSP_PRN "verbs_qp 0x%x"
+#define TID_WRITE_REQ_PRN "original_qp 0x%x"
+#define TID_WRITE_RSP_PRN "tid_flow_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_WRITE_DATA_PRN "verbs_qp 0x%x"
+#define TID_ACK_PRN "tid_flow_psn 0x%x verbs_psn 0x%x tid_flow_qp 0x%x verbs_qp 0x%x"
+#define TID_RESYNC_PRN "verbs_qp 0x%x"
#define OP(transport, op) IB_OPCODE_## transport ## _ ## op
@@ -322,6 +332,99 @@
parse_syndrome(be32_to_cpu(eh->aeth) >> 24),
be32_to_cpu(eh->aeth) & IB_MSN_MASK);
break;
+ case OP(TID_RDMA, WRITE_REQ):
+ trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " "
+ TID_WRITE_REQ_PRN,
+ le32_to_cpu(eh->tid_rdma.w_req.kdeth0),
+ le32_to_cpu(eh->tid_rdma.w_req.kdeth1),
+ ib_u64_get(&eh->tid_rdma.w_req.reth.vaddr),
+ be32_to_cpu(eh->tid_rdma.w_req.reth.rkey),
+ be32_to_cpu(eh->tid_rdma.w_req.reth.length),
+ be32_to_cpu(eh->tid_rdma.w_req.verbs_qp));
+ break;
+ case OP(TID_RDMA, WRITE_RESP):
+ trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " "
+ TID_WRITE_RSP_PRN,
+ le32_to_cpu(eh->tid_rdma.w_rsp.kdeth0),
+ le32_to_cpu(eh->tid_rdma.w_rsp.kdeth1),
+ be32_to_cpu(eh->tid_rdma.w_rsp.aeth) >> 24,
+ parse_syndrome(/* aeth */
+ be32_to_cpu(eh->tid_rdma.w_rsp.aeth)
+ >> 24),
+ (be32_to_cpu(eh->tid_rdma.w_rsp.aeth) &
+ IB_MSN_MASK),
+ be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_psn),
+ be32_to_cpu(eh->tid_rdma.w_rsp.tid_flow_qp),
+ be32_to_cpu(eh->tid_rdma.w_rsp.verbs_qp));
+ break;
+ case OP(TID_RDMA, WRITE_DATA_LAST):
+ case OP(TID_RDMA, WRITE_DATA):
+ trace_seq_printf(p, TID_RDMA_KDETH_DATA " " TID_WRITE_DATA_PRN,
+ le32_to_cpu(eh->tid_rdma.w_data.kdeth0),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, KVER),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, SH),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, INTR),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, TIDCTRL),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, TID),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth0, OFFSET),
+ le32_to_cpu(eh->tid_rdma.w_data.kdeth1),
+ KDETH_GET(eh->tid_rdma.w_data.kdeth1, JKEY),
+ be32_to_cpu(eh->tid_rdma.w_data.verbs_qp));
+ break;
+ case OP(TID_RDMA, READ_REQ):
+ trace_seq_printf(p, TID_RDMA_KDETH " " RETH_PRN " "
+ TID_READ_REQ_PRN,
+ le32_to_cpu(eh->tid_rdma.r_req.kdeth0),
+ le32_to_cpu(eh->tid_rdma.r_req.kdeth1),
+ ib_u64_get(&eh->tid_rdma.r_req.reth.vaddr),
+ be32_to_cpu(eh->tid_rdma.r_req.reth.rkey),
+ be32_to_cpu(eh->tid_rdma.r_req.reth.length),
+ be32_to_cpu(eh->tid_rdma.r_req.tid_flow_psn),
+ be32_to_cpu(eh->tid_rdma.r_req.tid_flow_qp),
+ be32_to_cpu(eh->tid_rdma.r_req.verbs_qp));
+ break;
+ case OP(TID_RDMA, READ_RESP):
+ trace_seq_printf(p, TID_RDMA_KDETH_DATA " " AETH_PRN " "
+ TID_READ_RSP_PRN,
+ le32_to_cpu(eh->tid_rdma.r_rsp.kdeth0),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, KVER),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, SH),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, INTR),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TIDCTRL),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, TID),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth0, OFFSET),
+ le32_to_cpu(eh->tid_rdma.r_rsp.kdeth1),
+ KDETH_GET(eh->tid_rdma.r_rsp.kdeth1, JKEY),
+ be32_to_cpu(eh->tid_rdma.r_rsp.aeth) >> 24,
+ parse_syndrome(/* aeth */
+ be32_to_cpu(eh->tid_rdma.r_rsp.aeth)
+ >> 24),
+ (be32_to_cpu(eh->tid_rdma.r_rsp.aeth) &
+ IB_MSN_MASK),
+ be32_to_cpu(eh->tid_rdma.r_rsp.verbs_qp));
+ break;
+ case OP(TID_RDMA, ACK):
+ trace_seq_printf(p, TID_RDMA_KDETH " " AETH_PRN " "
+ TID_ACK_PRN,
+ le32_to_cpu(eh->tid_rdma.ack.kdeth0),
+ le32_to_cpu(eh->tid_rdma.ack.kdeth1),
+ be32_to_cpu(eh->tid_rdma.ack.aeth) >> 24,
+ parse_syndrome(/* aeth */
+ be32_to_cpu(eh->tid_rdma.ack.aeth)
+ >> 24),
+ (be32_to_cpu(eh->tid_rdma.ack.aeth) &
+ IB_MSN_MASK),
+ be32_to_cpu(eh->tid_rdma.ack.tid_flow_psn),
+ be32_to_cpu(eh->tid_rdma.ack.verbs_psn),
+ be32_to_cpu(eh->tid_rdma.ack.tid_flow_qp),
+ be32_to_cpu(eh->tid_rdma.ack.verbs_qp));
+ break;
+ case OP(TID_RDMA, RESYNC):
+ trace_seq_printf(p, TID_RDMA_KDETH " " TID_RESYNC_PRN,
+ le32_to_cpu(eh->tid_rdma.resync.kdeth0),
+ le32_to_cpu(eh->tid_rdma.resync.kdeth1),
+ be32_to_cpu(eh->tid_rdma.resync.verbs_qp));
+ break;
/* aeth + atomicacketh */
case OP(RC, ATOMIC_ACKNOWLEDGE):
trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN,
@@ -394,6 +497,21 @@
return ret;
}
+u8 hfi1_trace_get_tid_ctrl(u32 ent)
+{
+ return EXP_TID_GET(ent, CTRL);
+}
+
+u16 hfi1_trace_get_tid_len(u32 ent)
+{
+ return EXP_TID_GET(ent, LEN);
+}
+
+u16 hfi1_trace_get_tid_idx(u32 ent)
+{
+ return EXP_TID_GET(ent, IDX);
+}
+
__hfi1_trace_fn(AFFINITY);
__hfi1_trace_fn(PKT);
__hfi1_trace_fn(PROC);
diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h
index 8540463..1ce5518 100644
--- a/drivers/infiniband/hw/hfi1/trace.h
+++ b/drivers/infiniband/hw/hfi1/trace.h
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -62,3 +62,5 @@
#include "trace_rx.h"
#include "trace_tx.h"
#include "trace_mmu.h"
+#include "trace_iowait.h"
+#include "trace_tid.h"
diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h
index e62171f..de7a873 100644
--- a/drivers/infiniband/hw/hfi1/trace_dbg.h
+++ b/drivers/infiniband/hw/hfi1/trace_dbg.h
@@ -86,14 +86,14 @@
* actual function to work and can not be in a macro.
*/
#define __hfi1_trace_def(lvl) \
-void __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \
+void __printf(2, 3) __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \
\
DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl, \
TP_PROTO(const char *function, struct va_format *vaf), \
TP_ARGS(function, vaf))
#define __hfi1_trace_fn(lvl) \
-void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \
+void __printf(2, 3) __hfi1_trace_##lvl(const char *func, char *fmt, ...)\
{ \
struct va_format vaf = { \
.fmt = fmt, \
diff --git a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
index 1dc2c28..2f84290 100644
--- a/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
+++ b/drivers/infiniband/hw/hfi1/trace_ibhdrs.h
@@ -79,6 +79,16 @@
ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \
ib_opcode_name(RC_COMPARE_SWAP), \
ib_opcode_name(RC_FETCH_ADD), \
+ ib_opcode_name(RC_SEND_LAST_WITH_INVALIDATE), \
+ ib_opcode_name(RC_SEND_ONLY_WITH_INVALIDATE), \
+ ib_opcode_name(TID_RDMA_WRITE_REQ), \
+ ib_opcode_name(TID_RDMA_WRITE_RESP), \
+ ib_opcode_name(TID_RDMA_WRITE_DATA), \
+ ib_opcode_name(TID_RDMA_WRITE_DATA_LAST), \
+ ib_opcode_name(TID_RDMA_READ_REQ), \
+ ib_opcode_name(TID_RDMA_READ_RESP), \
+ ib_opcode_name(TID_RDMA_RESYNC), \
+ ib_opcode_name(TID_RDMA_ACK), \
ib_opcode_name(UC_SEND_FIRST), \
ib_opcode_name(UC_SEND_MIDDLE), \
ib_opcode_name(UC_SEND_LAST), \
diff --git a/drivers/infiniband/hw/hfi1/trace_iowait.h b/drivers/infiniband/hw/hfi1/trace_iowait.h
new file mode 100644
index 0000000..27f4334
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_iowait.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#if !defined(__HFI1_TRACE_IOWAIT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_IOWAIT_H
+
+#include <linux/tracepoint.h>
+#include "iowait.h"
+#include "verbs.h"
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_iowait
+
+DECLARE_EVENT_CLASS(hfi1_iowait_template,
+ TP_PROTO(struct iowait *wait, u32 flag),
+ TP_ARGS(wait, flag),
+ TP_STRUCT__entry(/* entry */
+ __field(unsigned long, addr)
+ __field(unsigned long, flags)
+ __field(u32, flag)
+ __field(u32, qpn)
+ ),
+ TP_fast_assign(/* assign */
+ __entry->addr = (unsigned long)wait;
+ __entry->flags = wait->flags;
+ __entry->flag = (1 << flag);
+ __entry->qpn = iowait_to_qp(wait)->ibqp.qp_num;
+ ),
+ TP_printk(/* print */
+ "iowait 0x%lx qp %u flags 0x%lx flag 0x%x",
+ __entry->addr,
+ __entry->qpn,
+ __entry->flags,
+ __entry->flag
+ )
+ );
+
+DEFINE_EVENT(hfi1_iowait_template, hfi1_iowait_set,
+ TP_PROTO(struct iowait *wait, u32 flag),
+ TP_ARGS(wait, flag));
+
+DEFINE_EVENT(hfi1_iowait_template, hfi1_iowait_clear,
+ TP_PROTO(struct iowait *wait, u32 flag),
+ TP_ARGS(wait, flag));
+
+#endif /* __HFI1_TRACE_IOWAIT_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_iowait
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_rc.h b/drivers/infiniband/hw/hfi1/trace_rc.h
index 8ce4765..1ebca37 100644
--- a/drivers/infiniband/hw/hfi1/trace_rc.h
+++ b/drivers/infiniband/hw/hfi1/trace_rc.h
@@ -109,6 +109,54 @@
TP_ARGS(qp, psn)
);
+DEFINE_EVENT(/* event */
+ hfi1_rc_template, hfi1_rc_completion,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DECLARE_EVENT_CLASS(/* rc_ack */
+ hfi1_rc_ack_template,
+ TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+ struct rvt_swqe *wqe),
+ TP_ARGS(qp, aeth, psn, wqe),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, aeth)
+ __field(u32, psn)
+ __field(u8, opcode)
+ __field(u32, spsn)
+ __field(u32, lpsn)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->aeth = aeth;
+ __entry->psn = psn;
+ __entry->opcode = wqe->wr.opcode;
+ __entry->spsn = wqe->psn;
+ __entry->lpsn = wqe->lpsn;
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x aeth 0x%x psn 0x%x opcode 0x%x spsn 0x%x lpsn 0x%x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->aeth,
+ __entry->psn,
+ __entry->opcode,
+ __entry->spsn,
+ __entry->lpsn
+ )
+);
+
+DEFINE_EVENT(/* do_rc_ack */
+ hfi1_rc_ack_template, hfi1_rc_ack_do,
+ TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+ struct rvt_swqe *wqe),
+ TP_ARGS(qp, aeth, psn, wqe)
+);
+
#endif /* __HFI1_TRACE_RC_H */
#undef TRACE_INCLUDE_PATH
diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h
index 7eceb57..3cec960 100644
--- a/drivers/infiniband/hw/hfi1/trace_rx.h
+++ b/drivers/infiniband/hw/hfi1/trace_rx.h
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -128,111 +128,6 @@
)
);
-DECLARE_EVENT_CLASS(
- hfi1_exp_tid_reg_unreg,
- TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr,
- u32 npages, unsigned long va, unsigned long pa,
- dma_addr_t dma),
- TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
- TP_STRUCT__entry(
- __field(unsigned int, ctxt)
- __field(u16, subctxt)
- __field(u32, rarr)
- __field(u32, npages)
- __field(unsigned long, va)
- __field(unsigned long, pa)
- __field(dma_addr_t, dma)
- ),
- TP_fast_assign(
- __entry->ctxt = ctxt;
- __entry->subctxt = subctxt;
- __entry->rarr = rarr;
- __entry->npages = npages;
- __entry->va = va;
- __entry->pa = pa;
- __entry->dma = dma;
- ),
- TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
- __entry->ctxt,
- __entry->subctxt,
- __entry->rarr,
- __entry->npages,
- __entry->pa,
- __entry->va,
- __entry->dma
- )
- );
-
-DEFINE_EVENT(
- hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg,
- TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
- unsigned long va, unsigned long pa, dma_addr_t dma),
- TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma));
-
-DEFINE_EVENT(
- hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg,
- TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
- unsigned long va, unsigned long pa, dma_addr_t dma),
- TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma));
-
-TRACE_EVENT(
- hfi1_put_tid,
- TP_PROTO(struct hfi1_devdata *dd,
- u32 index, u32 type, unsigned long pa, u16 order),
- TP_ARGS(dd, index, type, pa, order),
- TP_STRUCT__entry(
- DD_DEV_ENTRY(dd)
- __field(unsigned long, pa);
- __field(u32, index);
- __field(u32, type);
- __field(u16, order);
- ),
- TP_fast_assign(
- DD_DEV_ASSIGN(dd);
- __entry->pa = pa;
- __entry->index = index;
- __entry->type = type;
- __entry->order = order;
- ),
- TP_printk("[%s] type %s pa %lx index %u order %u",
- __get_str(dev),
- show_tidtype(__entry->type),
- __entry->pa,
- __entry->index,
- __entry->order
- )
-);
-
-TRACE_EVENT(hfi1_exp_tid_inval,
- TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr,
- u32 npages, dma_addr_t dma),
- TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
- TP_STRUCT__entry(
- __field(unsigned int, ctxt)
- __field(u16, subctxt)
- __field(unsigned long, va)
- __field(u32, rarr)
- __field(u32, npages)
- __field(dma_addr_t, dma)
- ),
- TP_fast_assign(
- __entry->ctxt = ctxt;
- __entry->subctxt = subctxt;
- __entry->va = va;
- __entry->rarr = rarr;
- __entry->npages = npages;
- __entry->dma = dma;
- ),
- TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
- __entry->ctxt,
- __entry->subctxt,
- __entry->rarr,
- __entry->npages,
- __entry->va,
- __entry->dma
- )
- );
-
TRACE_EVENT(hfi1_mmu_invalidate,
TP_PROTO(unsigned int ctxt, u16 subctxt, const char *type,
unsigned long start, unsigned long end),
diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h
new file mode 100644
index 0000000..343fb98
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/trace_tid.h
@@ -0,0 +1,1642 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2018 Intel Corporation.
+ *
+ */
+#if !defined(__HFI1_TRACE_TID_H) || defined(TRACE_HEADER_MULTI_READ)
+#define __HFI1_TRACE_TID_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "hfi.h"
+
+#define tidtype_name(type) { PT_##type, #type }
+#define show_tidtype(type) \
+__print_symbolic(type, \
+ tidtype_name(EXPECTED), \
+ tidtype_name(EAGER), \
+ tidtype_name(INVALID)) \
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM hfi1_tid
+
+u8 hfi1_trace_get_tid_ctrl(u32 ent);
+u16 hfi1_trace_get_tid_len(u32 ent);
+u16 hfi1_trace_get_tid_idx(u32 ent);
+
+#define OPFN_PARAM_PRN "[%s] qpn 0x%x %s OPFN: qp 0x%x, max read %u, " \
+ "max write %u, max length %u, jkey 0x%x timeout %u " \
+ "urg %u"
+
+#define TID_FLOW_PRN "[%s] qpn 0x%x flow %d: idx %d resp_ib_psn 0x%x " \
+ "generation 0x%x fpsn 0x%x-%x r_next_psn 0x%x " \
+ "ib_psn 0x%x-%x npagesets %u tnode_cnt %u " \
+ "tidcnt %u tid_idx %u tid_offset %u length %u sent %u"
+
+#define TID_NODE_PRN "[%s] qpn 0x%x %s idx %u grp base 0x%x map 0x%x " \
+ "used %u cnt %u"
+
+#define RSP_INFO_PRN "[%s] qpn 0x%x state 0x%x s_state 0x%x psn 0x%x " \
+ "r_psn 0x%x r_state 0x%x r_flags 0x%x " \
+ "r_head_ack_queue %u s_tail_ack_queue %u " \
+ "s_acked_ack_queue %u s_ack_state 0x%x " \
+ "s_nak_state 0x%x s_flags 0x%x ps_flags 0x%x " \
+ "iow_flags 0x%lx"
+
+#define SENDER_INFO_PRN "[%s] qpn 0x%x state 0x%x s_cur %u s_tail %u " \
+ "s_head %u s_acked %u s_last %u s_psn 0x%x " \
+ "s_last_psn 0x%x s_flags 0x%x ps_flags 0x%x " \
+ "iow_flags 0x%lx s_state 0x%x s_num_rd %u s_retry %u"
+
+#define TID_READ_SENDER_PRN "[%s] qpn 0x%x newreq %u tid_r_reqs %u " \
+ "tid_r_comp %u pending_tid_r_segs %u " \
+ "s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \
+ "s_state 0x%x hw_flow_index %u generation 0x%x " \
+ "fpsn 0x%x"
+
+#define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \
+ "cur_seg %u comp_seg %u ack_seg %u alloc_seg %u " \
+ "total_segs %u setup_head %u clear_tail %u flow_idx %u " \
+ "acked_tail %u state %u r_ack_psn 0x%x r_flow_psn 0x%x " \
+ "r_last_ackd 0x%x s_next_psn 0x%x"
+
+#define RCV_ERR_PRN "[%s] qpn 0x%x s_flags 0x%x state 0x%x " \
+ "s_acked_ack_queue %u s_tail_ack_queue %u " \
+ "r_head_ack_queue %u opcode 0x%x psn 0x%x r_psn 0x%x " \
+ " diff %d"
+
+#define TID_WRITE_RSPDR_PRN "[%s] qpn 0x%x r_tid_head %u r_tid_tail %u " \
+ "r_tid_ack %u r_tid_alloc %u alloc_w_segs %u " \
+ "pending_tid_w_segs %u sync_pt %s " \
+ "ps_nak_psn 0x%x ps_nak_state 0x%x " \
+ "prnr_nak_state 0x%x hw_flow_index %u generation "\
+ "0x%x fpsn 0x%x resync %s" \
+ "r_next_psn_kdeth 0x%x"
+
+#define TID_WRITE_SENDER_PRN "[%s] qpn 0x%x newreq %u s_tid_cur %u " \
+ "s_tid_tail %u s_tid_head %u " \
+ "pending_tid_w_resp %u n_requests %u " \
+ "n_tid_requests %u s_flags 0x%x ps_flags 0x%x "\
+ "iow_flags 0x%lx s_state 0x%x s_retry %u"
+
+#define KDETH_EFLAGS_ERR_PRN "[%s] qpn 0x%x TID ERR: RcvType 0x%x " \
+ "RcvTypeError 0x%x PSN 0x%x"
+
+DECLARE_EVENT_CLASS(/* class */
+ hfi1_exp_tid_reg_unreg,
+ TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
+ unsigned long va, unsigned long pa, dma_addr_t dma),
+ TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma),
+ TP_STRUCT__entry(/* entry */
+ __field(unsigned int, ctxt)
+ __field(u16, subctxt)
+ __field(u32, rarr)
+ __field(u32, npages)
+ __field(unsigned long, va)
+ __field(unsigned long, pa)
+ __field(dma_addr_t, dma)
+ ),
+ TP_fast_assign(/* assign */
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->rarr = rarr;
+ __entry->npages = npages;
+ __entry->va = va;
+ __entry->pa = pa;
+ __entry->dma = dma;
+ ),
+ TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx",
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->rarr,
+ __entry->npages,
+ __entry->pa,
+ __entry->va,
+ __entry->dma
+ )
+);
+
+DEFINE_EVENT(/* exp_tid_unreg */
+ hfi1_exp_tid_reg_unreg, hfi1_exp_tid_unreg,
+ TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
+ unsigned long va, unsigned long pa, dma_addr_t dma),
+ TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)
+);
+
+DEFINE_EVENT(/* exp_tid_reg */
+ hfi1_exp_tid_reg_unreg, hfi1_exp_tid_reg,
+ TP_PROTO(unsigned int ctxt, u16 subctxt, u32 rarr, u32 npages,
+ unsigned long va, unsigned long pa, dma_addr_t dma),
+ TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma)
+);
+
+TRACE_EVENT(/* put_tid */
+ hfi1_put_tid,
+ TP_PROTO(struct hfi1_devdata *dd,
+ u32 index, u32 type, unsigned long pa, u16 order),
+ TP_ARGS(dd, index, type, pa, order),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd)
+ __field(unsigned long, pa);
+ __field(u32, index);
+ __field(u32, type);
+ __field(u16, order);
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd);
+ __entry->pa = pa;
+ __entry->index = index;
+ __entry->type = type;
+ __entry->order = order;
+ ),
+ TP_printk("[%s] type %s pa %lx index %u order %u",
+ __get_str(dev),
+ show_tidtype(__entry->type),
+ __entry->pa,
+ __entry->index,
+ __entry->order
+ )
+);
+
+TRACE_EVENT(/* exp_tid_inval */
+ hfi1_exp_tid_inval,
+ TP_PROTO(unsigned int ctxt, u16 subctxt, unsigned long va, u32 rarr,
+ u32 npages, dma_addr_t dma),
+ TP_ARGS(ctxt, subctxt, va, rarr, npages, dma),
+ TP_STRUCT__entry(/* entry */
+ __field(unsigned int, ctxt)
+ __field(u16, subctxt)
+ __field(unsigned long, va)
+ __field(u32, rarr)
+ __field(u32, npages)
+ __field(dma_addr_t, dma)
+ ),
+ TP_fast_assign(/* assign */
+ __entry->ctxt = ctxt;
+ __entry->subctxt = subctxt;
+ __entry->va = va;
+ __entry->rarr = rarr;
+ __entry->npages = npages;
+ __entry->dma = dma;
+ ),
+ TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx",
+ __entry->ctxt,
+ __entry->subctxt,
+ __entry->rarr,
+ __entry->npages,
+ __entry->va,
+ __entry->dma
+ )
+);
+
+DECLARE_EVENT_CLASS(/* opfn_state */
+ hfi1_opfn_state_template,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u16, requested)
+ __field(u16, completed)
+ __field(u8, curr)
+ ),
+ TP_fast_assign(/* assign */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->requested = priv->opfn.requested;
+ __entry->completed = priv->opfn.completed;
+ __entry->curr = priv->opfn.curr;
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x requested 0x%x completed 0x%x curr 0x%x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->requested,
+ __entry->completed,
+ __entry->curr
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_opfn_state_template, hfi1_opfn_state_conn_request,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_opfn_state_template, hfi1_opfn_state_sched_conn_request,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_opfn_state_template, hfi1_opfn_state_conn_response,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_opfn_state_template, hfi1_opfn_state_conn_reply,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_opfn_state_template, hfi1_opfn_state_conn_error,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DECLARE_EVENT_CLASS(/* opfn_data */
+ hfi1_opfn_data_template,
+ TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
+ TP_ARGS(qp, capcode, data),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, state)
+ __field(u8, capcode)
+ __field(u64, data)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->state = qp->state;
+ __entry->capcode = capcode;
+ __entry->data = data;
+ ),
+ TP_printk(/* printk */
+ "[%s] qpn 0x%x (state 0x%x) Capcode %u data 0x%llx",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->state,
+ __entry->capcode,
+ __entry->data
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_opfn_data_template, hfi1_opfn_data_conn_request,
+ TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
+ TP_ARGS(qp, capcode, data)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_opfn_data_template, hfi1_opfn_data_conn_response,
+ TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
+ TP_ARGS(qp, capcode, data)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_opfn_data_template, hfi1_opfn_data_conn_reply,
+ TP_PROTO(struct rvt_qp *qp, u8 capcode, u64 data),
+ TP_ARGS(qp, capcode, data)
+);
+
+DECLARE_EVENT_CLASS(/* opfn_param */
+ hfi1_opfn_param_template,
+ TP_PROTO(struct rvt_qp *qp, char remote,
+ struct tid_rdma_params *param),
+ TP_ARGS(qp, remote, param),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, remote)
+ __field(u32, param_qp)
+ __field(u32, max_len)
+ __field(u16, jkey)
+ __field(u8, max_read)
+ __field(u8, max_write)
+ __field(u8, timeout)
+ __field(u8, urg)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->remote = remote;
+ __entry->param_qp = param->qp;
+ __entry->max_len = param->max_len;
+ __entry->jkey = param->jkey;
+ __entry->max_read = param->max_read;
+ __entry->max_write = param->max_write;
+ __entry->timeout = param->timeout;
+ __entry->urg = param->urg;
+ ),
+ TP_printk(/* print */
+ OPFN_PARAM_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->remote ? "remote" : "local",
+ __entry->param_qp,
+ __entry->max_read,
+ __entry->max_write,
+ __entry->max_len,
+ __entry->jkey,
+ __entry->timeout,
+ __entry->urg
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_opfn_param_template, hfi1_opfn_param,
+ TP_PROTO(struct rvt_qp *qp, char remote,
+ struct tid_rdma_params *param),
+ TP_ARGS(qp, remote, param)
+);
+
+DECLARE_EVENT_CLASS(/* msg */
+ hfi1_msg_template,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more),
+ TP_STRUCT__entry(/* entry */
+ __field(u32, qpn)
+ __string(msg, msg)
+ __field(u64, more)
+ ),
+ TP_fast_assign(/* assign */
+ __entry->qpn = qp ? qp->ibqp.qp_num : 0;
+ __assign_str(msg, msg);
+ __entry->more = more;
+ ),
+ TP_printk(/* print */
+ "qpn 0x%x %s 0x%llx",
+ __entry->qpn,
+ __get_str(msg),
+ __entry->more
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_msg_template, hfi1_msg_opfn_conn_request,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_msg_template, hfi1_msg_opfn_conn_error,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_msg_template, hfi1_msg_alloc_tids,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_msg_template, hfi1_msg_tid_restart_req,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_msg_template, hfi1_msg_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_msg_template, hfi1_msg_tid_timeout,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_msg_template, hfi1_msg_tid_retry_timeout,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u64 more),
+ TP_ARGS(qp, msg, more)
+);
+
+DECLARE_EVENT_CLASS(/* tid_flow_page */
+ hfi1_tid_flow_page_template,
+ TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index,
+ char mtu8k, char v1, void *vaddr),
+ TP_ARGS(qp, flow, index, mtu8k, v1, vaddr),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, mtu8k)
+ __field(char, v1)
+ __field(u32, index)
+ __field(u64, page)
+ __field(u64, vaddr)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->mtu8k = mtu8k;
+ __entry->v1 = v1;
+ __entry->index = index;
+ __entry->page = vaddr ? (u64)virt_to_page(vaddr) : 0ULL;
+ __entry->vaddr = (u64)vaddr;
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x page[%u]: page 0x%llx %s 0x%llx",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->index,
+ __entry->page,
+ __entry->mtu8k ? (__entry->v1 ? "v1" : "v0") : "vaddr",
+ __entry->vaddr
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_page_template, hfi1_tid_flow_page,
+ TP_PROTO(struct rvt_qp *qp, struct tid_rdma_flow *flow, u32 index,
+ char mtu8k, char v1, void *vaddr),
+ TP_ARGS(qp, flow, index, mtu8k, v1, vaddr)
+);
+
+DECLARE_EVENT_CLASS(/* tid_pageset */
+ hfi1_tid_pageset_template,
+ TP_PROTO(struct rvt_qp *qp, u32 index, u16 idx, u16 count),
+ TP_ARGS(qp, index, idx, count),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, index)
+ __field(u16, idx)
+ __field(u16, count)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->index = index;
+ __entry->idx = idx;
+ __entry->count = count;
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x list[%u]: idx %u count %u",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->index,
+ __entry->idx,
+ __entry->count
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_pageset_template, hfi1_tid_pageset,
+ TP_PROTO(struct rvt_qp *qp, u32 index, u16 idx, u16 count),
+ TP_ARGS(qp, index, idx, count)
+);
+
+DECLARE_EVENT_CLASS(/* tid_fow */
+ hfi1_tid_flow_template,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(int, index)
+ __field(int, idx)
+ __field(u32, resp_ib_psn)
+ __field(u32, generation)
+ __field(u32, fspsn)
+ __field(u32, flpsn)
+ __field(u32, r_next_psn)
+ __field(u32, ib_spsn)
+ __field(u32, ib_lpsn)
+ __field(u32, npagesets)
+ __field(u32, tnode_cnt)
+ __field(u32, tidcnt)
+ __field(u32, tid_idx)
+ __field(u32, tid_offset)
+ __field(u32, length)
+ __field(u32, sent)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->index = index;
+ __entry->idx = flow->idx;
+ __entry->resp_ib_psn = flow->flow_state.resp_ib_psn;
+ __entry->generation = flow->flow_state.generation;
+ __entry->fspsn = full_flow_psn(flow,
+ flow->flow_state.spsn);
+ __entry->flpsn = full_flow_psn(flow,
+ flow->flow_state.lpsn);
+ __entry->r_next_psn = flow->flow_state.r_next_psn;
+ __entry->ib_spsn = flow->flow_state.ib_spsn;
+ __entry->ib_lpsn = flow->flow_state.ib_lpsn;
+ __entry->npagesets = flow->npagesets;
+ __entry->tnode_cnt = flow->tnode_cnt;
+ __entry->tidcnt = flow->tidcnt;
+ __entry->tid_idx = flow->tid_idx;
+ __entry->tid_offset = flow->tid_offset;
+ __entry->length = flow->length;
+ __entry->sent = flow->sent;
+ ),
+ TP_printk(/* print */
+ TID_FLOW_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->index,
+ __entry->idx,
+ __entry->resp_ib_psn,
+ __entry->generation,
+ __entry->fspsn,
+ __entry->flpsn,
+ __entry->r_next_psn,
+ __entry->ib_spsn,
+ __entry->ib_lpsn,
+ __entry->npagesets,
+ __entry->tnode_cnt,
+ __entry->tidcnt,
+ __entry->tid_idx,
+ __entry->tid_offset,
+ __entry->length,
+ __entry->sent
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_alloc,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_build_read_pkt,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_build_read_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_rcv_read_req,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_rcv_read_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_restart_req,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_build_write_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_rcv_write_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_build_write_data,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_rcv_resync,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_flow_template, hfi1_tid_flow_read_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, int index, struct tid_rdma_flow *flow),
+ TP_ARGS(qp, index, flow)
+);
+
+DECLARE_EVENT_CLASS(/* tid_node */
+ hfi1_tid_node_template,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base,
+ u8 map, u8 used, u8 cnt),
+ TP_ARGS(qp, msg, index, base, map, used, cnt),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __string(msg, msg)
+ __field(u32, index)
+ __field(u32, base)
+ __field(u8, map)
+ __field(u8, used)
+ __field(u8, cnt)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __assign_str(msg, msg);
+ __entry->index = index;
+ __entry->base = base;
+ __entry->map = map;
+ __entry->used = used;
+ __entry->cnt = cnt;
+ ),
+ TP_printk(/* print */
+ TID_NODE_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __get_str(msg),
+ __entry->index,
+ __entry->base,
+ __entry->map,
+ __entry->used,
+ __entry->cnt
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_node_template, hfi1_tid_node_add,
+ TP_PROTO(struct rvt_qp *qp, const char *msg, u32 index, u32 base,
+ u8 map, u8 used, u8 cnt),
+ TP_ARGS(qp, msg, index, base, map, used, cnt)
+);
+
+DECLARE_EVENT_CLASS(/* tid_entry */
+ hfi1_tid_entry_template,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+ TP_ARGS(qp, index, ent),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(int, index)
+ __field(u8, ctrl)
+ __field(u16, idx)
+ __field(u16, len)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->index = index;
+ __entry->ctrl = hfi1_trace_get_tid_ctrl(ent);
+ __entry->idx = hfi1_trace_get_tid_idx(ent);
+ __entry->len = hfi1_trace_get_tid_len(ent);
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x TID entry %d: idx %u len %u ctrl 0x%x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->index,
+ __entry->idx,
+ __entry->len,
+ __entry->ctrl
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_entry_template, hfi1_tid_entry_alloc,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 entry),
+ TP_ARGS(qp, index, entry)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_entry_template, hfi1_tid_entry_build_read_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+ TP_ARGS(qp, index, ent)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_entry_template, hfi1_tid_entry_rcv_read_req,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 ent),
+ TP_ARGS(qp, index, ent)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_entry_template, hfi1_tid_entry_rcv_write_resp,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 entry),
+ TP_ARGS(qp, index, entry)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_entry_template, hfi1_tid_entry_build_write_data,
+ TP_PROTO(struct rvt_qp *qp, int index, u32 entry),
+ TP_ARGS(qp, index, entry)
+);
+
+DECLARE_EVENT_CLASS(/* rsp_info */
+ hfi1_responder_info_template,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u8, state)
+ __field(u8, s_state)
+ __field(u32, psn)
+ __field(u32, r_psn)
+ __field(u8, r_state)
+ __field(u8, r_flags)
+ __field(u8, r_head_ack_queue)
+ __field(u8, s_tail_ack_queue)
+ __field(u8, s_acked_ack_queue)
+ __field(u8, s_ack_state)
+ __field(u8, s_nak_state)
+ __field(u8, r_nak_state)
+ __field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
+ ),
+ TP_fast_assign(/* assign */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->state = qp->state;
+ __entry->s_state = qp->s_state;
+ __entry->psn = psn;
+ __entry->r_psn = qp->r_psn;
+ __entry->r_state = qp->r_state;
+ __entry->r_flags = qp->r_flags;
+ __entry->r_head_ack_queue = qp->r_head_ack_queue;
+ __entry->s_tail_ack_queue = qp->s_tail_ack_queue;
+ __entry->s_acked_ack_queue = qp->s_acked_ack_queue;
+ __entry->s_ack_state = qp->s_ack_state;
+ __entry->s_nak_state = qp->s_nak_state;
+ __entry->s_flags = qp->s_flags;
+ __entry->ps_flags = priv->s_flags;
+ __entry->iow_flags = priv->s_iowait.flags;
+ ),
+ TP_printk(/* print */
+ RSP_INFO_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->state,
+ __entry->s_state,
+ __entry->psn,
+ __entry->r_psn,
+ __entry->r_state,
+ __entry->r_flags,
+ __entry->r_head_ack_queue,
+ __entry->s_tail_ack_queue,
+ __entry->s_acked_ack_queue,
+ __entry->s_ack_state,
+ __entry->s_nak_state,
+ __entry->s_flags,
+ __entry->ps_flags,
+ __entry->iow_flags
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_make_rc_ack,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_rcv_tid_read_req,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_tid_rcv_error,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_tid_write_alloc_res,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_req,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_build_tid_write_resp,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_rcv_tid_write_data,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_make_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_responder_info_template, hfi1_rsp_read_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, u32 psn),
+ TP_ARGS(qp, psn)
+);
+
+DECLARE_EVENT_CLASS(/* sender_info */
+ hfi1_sender_info_template,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u8, state)
+ __field(u32, s_cur)
+ __field(u32, s_tail)
+ __field(u32, s_head)
+ __field(u32, s_acked)
+ __field(u32, s_last)
+ __field(u32, s_psn)
+ __field(u32, s_last_psn)
+ __field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
+ __field(u8, s_state)
+ __field(u8, s_num_rd)
+ __field(u8, s_retry)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->state = qp->state;
+ __entry->s_cur = qp->s_cur;
+ __entry->s_tail = qp->s_tail;
+ __entry->s_head = qp->s_head;
+ __entry->s_acked = qp->s_acked;
+ __entry->s_last = qp->s_last;
+ __entry->s_psn = qp->s_psn;
+ __entry->s_last_psn = qp->s_last_psn;
+ __entry->s_flags = qp->s_flags;
+ __entry->ps_flags = ((struct hfi1_qp_priv *)qp->priv)->s_flags;
+ __entry->iow_flags =
+ ((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags;
+ __entry->s_state = qp->s_state;
+ __entry->s_num_rd = qp->s_num_rd_atomic;
+ __entry->s_retry = qp->s_retry;
+ ),
+ TP_printk(/* print */
+ SENDER_INFO_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->state,
+ __entry->s_cur,
+ __entry->s_tail,
+ __entry->s_head,
+ __entry->s_acked,
+ __entry->s_last,
+ __entry->s_psn,
+ __entry->s_last_psn,
+ __entry->s_flags,
+ __entry->ps_flags,
+ __entry->iow_flags,
+ __entry->s_state,
+ __entry->s_num_rd,
+ __entry->s_retry
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_make_rc_req,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_reset_psn,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_restart_rc,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_do_rc_ack,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_rcv_tid_read_resp,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_make_tid_pkt,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sender_info_template, hfi1_sender_read_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DECLARE_EVENT_CLASS(/* tid_read_sender */
+ hfi1_tid_read_sender_template,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, newreq)
+ __field(u32, tid_r_reqs)
+ __field(u32, tid_r_comp)
+ __field(u32, pending_tid_r_segs)
+ __field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
+ __field(u8, s_state)
+ __field(u32, hw_flow_index)
+ __field(u32, generation)
+ __field(u32, fpsn)
+ ),
+ TP_fast_assign(/* assign */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->newreq = newreq;
+ __entry->tid_r_reqs = priv->tid_r_reqs;
+ __entry->tid_r_comp = priv->tid_r_comp;
+ __entry->pending_tid_r_segs = priv->pending_tid_r_segs;
+ __entry->s_flags = qp->s_flags;
+ __entry->ps_flags = priv->s_flags;
+ __entry->iow_flags = priv->s_iowait.flags;
+ __entry->s_state = priv->s_state;
+ __entry->hw_flow_index = priv->flow_state.index;
+ __entry->generation = priv->flow_state.generation;
+ __entry->fpsn = priv->flow_state.psn;
+ ),
+ TP_printk(/* print */
+ TID_READ_SENDER_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->newreq,
+ __entry->tid_r_reqs,
+ __entry->tid_r_comp,
+ __entry->pending_tid_r_segs,
+ __entry->s_flags,
+ __entry->ps_flags,
+ __entry->iow_flags,
+ __entry->s_state,
+ __entry->hw_flow_index,
+ __entry->generation,
+ __entry->fpsn
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_read_sender_template, hfi1_tid_read_sender_make_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_read_sender_template, hfi1_tid_read_sender_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DECLARE_EVENT_CLASS(/* tid_rdma_request */
+ hfi1_tid_rdma_request_template,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, newreq)
+ __field(u8, opcode)
+ __field(u32, psn)
+ __field(u32, lpsn)
+ __field(u32, cur_seg)
+ __field(u32, comp_seg)
+ __field(u32, ack_seg)
+ __field(u32, alloc_seg)
+ __field(u32, total_segs)
+ __field(u16, setup_head)
+ __field(u16, clear_tail)
+ __field(u16, flow_idx)
+ __field(u16, acked_tail)
+ __field(u32, state)
+ __field(u32, r_ack_psn)
+ __field(u32, r_flow_psn)
+ __field(u32, r_last_acked)
+ __field(u32, s_next_psn)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->newreq = newreq;
+ __entry->opcode = opcode;
+ __entry->psn = psn;
+ __entry->lpsn = lpsn;
+ __entry->cur_seg = req->cur_seg;
+ __entry->comp_seg = req->comp_seg;
+ __entry->ack_seg = req->ack_seg;
+ __entry->alloc_seg = req->alloc_seg;
+ __entry->total_segs = req->total_segs;
+ __entry->setup_head = req->setup_head;
+ __entry->clear_tail = req->clear_tail;
+ __entry->flow_idx = req->flow_idx;
+ __entry->acked_tail = req->acked_tail;
+ __entry->state = req->state;
+ __entry->r_ack_psn = req->r_ack_psn;
+ __entry->r_flow_psn = req->r_flow_psn;
+ __entry->r_last_acked = req->r_last_acked;
+ __entry->s_next_psn = req->s_next_psn;
+ ),
+ TP_printk(/* print */
+ TID_REQ_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->newreq,
+ __entry->opcode,
+ __entry->psn,
+ __entry->lpsn,
+ __entry->cur_seg,
+ __entry->comp_seg,
+ __entry->ack_seg,
+ __entry->alloc_seg,
+ __entry->total_segs,
+ __entry->setup_head,
+ __entry->clear_tail,
+ __entry->flow_idx,
+ __entry->acked_tail,
+ __entry->state,
+ __entry->r_ack_psn,
+ __entry->r_flow_psn,
+ __entry->r_last_acked,
+ __entry->s_next_psn
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_read,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_build_read_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_read_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_err,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_restart_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_setup_tid_wqe,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_write_alloc_res,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_build_write_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_write_data,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_tid_retry_timeout,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_rcv_resync,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_pkt,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_read_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_rc_ack_write,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_make_req_write,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_rdma_request_template, hfi1_tid_req_update_num_rd_atomic,
+ TP_PROTO(struct rvt_qp *qp, char newreq, u8 opcode, u32 psn, u32 lpsn,
+ struct tid_rdma_request *req),
+ TP_ARGS(qp, newreq, opcode, psn, lpsn, req)
+);
+
+DECLARE_EVENT_CLASS(/* rc_rcv_err */
+ hfi1_rc_rcv_err_template,
+ TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff),
+ TP_ARGS(qp, opcode, psn, diff),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, s_flags)
+ __field(u8, state)
+ __field(u8, s_acked_ack_queue)
+ __field(u8, s_tail_ack_queue)
+ __field(u8, r_head_ack_queue)
+ __field(u32, opcode)
+ __field(u32, psn)
+ __field(u32, r_psn)
+ __field(int, diff)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->s_flags = qp->s_flags;
+ __entry->state = qp->state;
+ __entry->s_acked_ack_queue = qp->s_acked_ack_queue;
+ __entry->s_tail_ack_queue = qp->s_tail_ack_queue;
+ __entry->r_head_ack_queue = qp->r_head_ack_queue;
+ __entry->opcode = opcode;
+ __entry->psn = psn;
+ __entry->r_psn = qp->r_psn;
+ __entry->diff = diff;
+ ),
+ TP_printk(/* print */
+ RCV_ERR_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->s_flags,
+ __entry->state,
+ __entry->s_acked_ack_queue,
+ __entry->s_tail_ack_queue,
+ __entry->r_head_ack_queue,
+ __entry->opcode,
+ __entry->psn,
+ __entry->r_psn,
+ __entry->diff
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_rc_rcv_err_template, hfi1_tid_rdma_rcv_err,
+ TP_PROTO(struct rvt_qp *qp, u32 opcode, u32 psn, int diff),
+ TP_ARGS(qp, opcode, psn, diff)
+);
+
+DECLARE_EVENT_CLASS(/* sge */
+ hfi1_sge_template,
+ TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge),
+ TP_ARGS(qp, index, sge),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(int, index)
+ __field(u64, vaddr)
+ __field(u32, sge_length)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->index = index;
+ __entry->vaddr = (u64)sge->vaddr;
+ __entry->sge_length = sge->sge_length;
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x sge %d: vaddr 0x%llx sge_length %u",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->index,
+ __entry->vaddr,
+ __entry->sge_length
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_sge_template, hfi1_sge_check_align,
+ TP_PROTO(struct rvt_qp *qp, int index, struct rvt_sge *sge),
+ TP_ARGS(qp, index, sge)
+);
+
+DECLARE_EVENT_CLASS(/* tid_write_sp */
+ hfi1_tid_write_rsp_template,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, r_tid_head)
+ __field(u32, r_tid_tail)
+ __field(u32, r_tid_ack)
+ __field(u32, r_tid_alloc)
+ __field(u32, alloc_w_segs)
+ __field(u32, pending_tid_w_segs)
+ __field(bool, sync_pt)
+ __field(u32, ps_nak_psn)
+ __field(u8, ps_nak_state)
+ __field(u8, prnr_nak_state)
+ __field(u32, hw_flow_index)
+ __field(u32, generation)
+ __field(u32, fpsn)
+ __field(bool, resync)
+ __field(u32, r_next_psn_kdeth)
+ ),
+ TP_fast_assign(/* assign */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->r_tid_head = priv->r_tid_head;
+ __entry->r_tid_tail = priv->r_tid_tail;
+ __entry->r_tid_ack = priv->r_tid_ack;
+ __entry->r_tid_alloc = priv->r_tid_alloc;
+ __entry->alloc_w_segs = priv->alloc_w_segs;
+ __entry->pending_tid_w_segs = priv->pending_tid_w_segs;
+ __entry->sync_pt = priv->sync_pt;
+ __entry->ps_nak_psn = priv->s_nak_psn;
+ __entry->ps_nak_state = priv->s_nak_state;
+ __entry->prnr_nak_state = priv->rnr_nak_state;
+ __entry->hw_flow_index = priv->flow_state.index;
+ __entry->generation = priv->flow_state.generation;
+ __entry->fpsn = priv->flow_state.psn;
+ __entry->resync = priv->resync;
+ __entry->r_next_psn_kdeth = priv->r_next_psn_kdeth;
+ ),
+ TP_printk(/* print */
+ TID_WRITE_RSPDR_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->r_tid_head,
+ __entry->r_tid_tail,
+ __entry->r_tid_ack,
+ __entry->r_tid_alloc,
+ __entry->alloc_w_segs,
+ __entry->pending_tid_w_segs,
+ __entry->sync_pt ? "yes" : "no",
+ __entry->ps_nak_psn,
+ __entry->ps_nak_state,
+ __entry->prnr_nak_state,
+ __entry->hw_flow_index,
+ __entry->generation,
+ __entry->fpsn,
+ __entry->resync ? "yes" : "no",
+ __entry->r_next_psn_kdeth
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_alloc_res,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_req,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_build_resp,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_data,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_rcv_resync,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_tid_ack,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_handle_kdeth_eflags,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_rsp_template, hfi1_tid_write_rsp_make_rc_ack,
+ TP_PROTO(struct rvt_qp *qp),
+ TP_ARGS(qp)
+);
+
+DECLARE_EVENT_CLASS(/* tid_write_sender */
+ hfi1_tid_write_sender_template,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(char, newreq)
+ __field(u32, s_tid_cur)
+ __field(u32, s_tid_tail)
+ __field(u32, s_tid_head)
+ __field(u32, pending_tid_w_resp)
+ __field(u32, n_requests)
+ __field(u32, n_tid_requests)
+ __field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
+ __field(u8, s_state)
+ __field(u8, s_retry)
+ ),
+ TP_fast_assign(/* assign */
+ struct hfi1_qp_priv *priv = qp->priv;
+
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->newreq = newreq;
+ __entry->s_tid_cur = priv->s_tid_cur;
+ __entry->s_tid_tail = priv->s_tid_tail;
+ __entry->s_tid_head = priv->s_tid_head;
+ __entry->pending_tid_w_resp = priv->pending_tid_w_resp;
+ __entry->n_requests = atomic_read(&priv->n_requests);
+ __entry->n_tid_requests = atomic_read(&priv->n_tid_requests);
+ __entry->s_flags = qp->s_flags;
+ __entry->ps_flags = priv->s_flags;
+ __entry->iow_flags = priv->s_iowait.flags;
+ __entry->s_state = priv->s_state;
+ __entry->s_retry = priv->s_retry;
+ ),
+ TP_printk(/* print */
+ TID_WRITE_SENDER_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->newreq,
+ __entry->s_tid_cur,
+ __entry->s_tid_tail,
+ __entry->s_tid_head,
+ __entry->pending_tid_w_resp,
+ __entry->n_requests,
+ __entry->n_tid_requests,
+ __entry->s_flags,
+ __entry->ps_flags,
+ __entry->iow_flags,
+ __entry->s_state,
+ __entry->s_retry
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_resp,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_retry_timeout,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_tid_pkt,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_make_req,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_tid_write_sender_template, hfi1_tid_write_sender_restart_rc,
+ TP_PROTO(struct rvt_qp *qp, char newreq),
+ TP_ARGS(qp, newreq)
+);
+
+DECLARE_EVENT_CLASS(/* tid_ack */
+ hfi1_tid_ack_template,
+ TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+ u32 req_psn, u32 resync_psn),
+ TP_ARGS(qp, aeth, psn, req_psn, resync_psn),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u32, aeth)
+ __field(u32, psn)
+ __field(u32, req_psn)
+ __field(u32, resync_psn)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->aeth = aeth;
+ __entry->psn = psn;
+ __entry->req_psn = req_psn;
+ __entry->resync_psn = resync_psn;
+ ),
+ TP_printk(/* print */
+ "[%s] qpn 0x%x aeth 0x%x psn 0x%x req_psn 0x%x resync_psn 0x%x",
+ __get_str(dev),
+ __entry->qpn,
+ __entry->aeth,
+ __entry->psn,
+ __entry->req_psn,
+ __entry->resync_psn
+ )
+);
+
+DEFINE_EVENT(/* rcv_tid_ack */
+ hfi1_tid_ack_template, hfi1_rcv_tid_ack,
+ TP_PROTO(struct rvt_qp *qp, u32 aeth, u32 psn,
+ u32 req_psn, u32 resync_psn),
+ TP_ARGS(qp, aeth, psn, req_psn, resync_psn)
+);
+
+DECLARE_EVENT_CLASS(/* kdeth_eflags_error */
+ hfi1_kdeth_eflags_error_template,
+ TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn),
+ TP_ARGS(qp, rcv_type, rte, psn),
+ TP_STRUCT__entry(/* entry */
+ DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device))
+ __field(u32, qpn)
+ __field(u8, rcv_type)
+ __field(u8, rte)
+ __field(u32, psn)
+ ),
+ TP_fast_assign(/* assign */
+ DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device));
+ __entry->qpn = qp->ibqp.qp_num;
+ __entry->rcv_type = rcv_type;
+ __entry->rte = rte;
+ __entry->psn = psn;
+ ),
+ TP_printk(/* print */
+ KDETH_EFLAGS_ERR_PRN,
+ __get_str(dev),
+ __entry->qpn,
+ __entry->rcv_type,
+ __entry->rte,
+ __entry->psn
+ )
+);
+
+DEFINE_EVENT(/* event */
+ hfi1_kdeth_eflags_error_template, hfi1_eflags_err_write,
+ TP_PROTO(struct rvt_qp *qp, u8 rcv_type, u8 rte, u32 psn),
+ TP_ARGS(qp, rcv_type, rte, psn)
+);
+
+#endif /* __HFI1_TRACE_TID_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_tid
+#include <trace/define_trace.h>
diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h
index c57af3b..09eb0c9 100644
--- a/drivers/infiniband/hw/hfi1/trace_tx.h
+++ b/drivers/infiniband/hw/hfi1/trace_tx.h
@@ -114,19 +114,27 @@
__field(u32, qpn)
__field(u32, flags)
__field(u32, s_flags)
+ __field(u32, ps_flags)
+ __field(unsigned long, iow_flags)
),
TP_fast_assign(
DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device))
__entry->flags = flags;
__entry->qpn = qp->ibqp.qp_num;
__entry->s_flags = qp->s_flags;
+ __entry->ps_flags =
+ ((struct hfi1_qp_priv *)qp->priv)->s_flags;
+ __entry->iow_flags =
+ ((struct hfi1_qp_priv *)qp->priv)->s_iowait.flags;
),
TP_printk(
- "[%s] qpn 0x%x flags 0x%x s_flags 0x%x",
+ "[%s] qpn 0x%x flags 0x%x s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx",
__get_str(dev),
__entry->qpn,
__entry->flags,
- __entry->s_flags
+ __entry->s_flags,
+ __entry->ps_flags,
+ __entry->iow_flags
)
);
@@ -838,6 +846,12 @@
TP_ARGS(qp, flag)
);
+DEFINE_EVENT(/* event */
+ hfi1_do_send_template, hfi1_rc_do_tid_send,
+ TP_PROTO(struct rvt_qp *qp, bool flag),
+ TP_ARGS(qp, flag)
+);
+
DEFINE_EVENT(
hfi1_do_send_template, hfi1_rc_expired_time_slice,
TP_PROTO(struct rvt_qp *qp, bool flag),
diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c
index e254dce..0c77f18 100644
--- a/drivers/infiniband/hw/hfi1/uc.c
+++ b/drivers/infiniband/hw/hfi1/uc.c
@@ -88,7 +88,7 @@
}
clear_ahg(qp);
wqe = rvt_get_swqe_ptr(qp, qp->s_last);
- hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+ rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
goto done_free_tx;
}
@@ -140,7 +140,7 @@
qp, wqe->wr.ex.invalidate_rkey);
local_ops = 1;
}
- hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR
+ rvt_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR
: IB_WC_SUCCESS);
if (local_ops)
atomic_dec(&qp->local_ops_pending);
@@ -271,7 +271,8 @@
ps->s_txreq->ss = &qp->s_sge;
ps->s_txreq->s_cur_size = len;
hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
- mask_psn(qp->s_psn++), middle, ps);
+ qp->remote_qpn, mask_psn(qp->s_psn++),
+ middle, ps);
return 1;
done_free_tx:
@@ -321,7 +322,7 @@
if (hfi1_ruc_check_hdr(ibp, packet))
return;
- process_ecn(qp, packet, true);
+ process_ecn(qp, packet);
psn = ib_bth_get_psn(ohdr);
/* Compare the PSN verses the expected PSN. */
@@ -426,7 +427,7 @@
qp->r_rcv_len += pmtu;
if (unlikely(qp->r_rcv_len > qp->r_len))
goto rewind;
- hfi1_copy_sge(&qp->r_sge, data, pmtu, false, false);
+ rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false);
break;
case OP(SEND_LAST_WITH_IMMEDIATE):
@@ -449,7 +450,7 @@
if (unlikely(wc.byte_len > qp->r_len))
goto rewind;
wc.opcode = IB_WC_RECV;
- hfi1_copy_sge(&qp->r_sge, data, tlen, false, false);
+ rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false);
rvt_put_ss(&qp->s_rdma_read_sge);
last_imm:
wc.wr_id = qp->r_wr_id;
@@ -475,8 +476,7 @@
wc.dlid_path_bits = 0;
wc.port_num = 0;
/* Signal completion event if the solicited bit is set. */
- rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
- ib_bth_is_solicited(ohdr));
+ rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
break;
case OP(RDMA_WRITE_FIRST):
@@ -523,7 +523,7 @@
qp->r_rcv_len += pmtu;
if (unlikely(qp->r_rcv_len > qp->r_len))
goto drop;
- hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false);
+ rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false);
break;
case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
@@ -550,7 +550,7 @@
}
wc.byte_len = qp->r_len;
wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
- hfi1_copy_sge(&qp->r_sge, data, tlen, true, false);
+ rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false);
rvt_put_ss(&qp->r_sge);
goto last_imm;
@@ -564,7 +564,7 @@
tlen -= (hdrsize + extra_bytes);
if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
goto drop;
- hfi1_copy_sge(&qp->r_sge, data, tlen, true, false);
+ rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false);
rvt_put_ss(&qp->r_sge);
break;
diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c
index 70d39fc..e804af7 100644
--- a/drivers/infiniband/hw/hfi1/ud.c
+++ b/drivers/infiniband/hw/hfi1/ud.c
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2019 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -51,6 +51,7 @@
#include "hfi.h"
#include "mad.h"
#include "verbs_txreq.h"
+#include "trace_ibhdrs.h"
#include "qp.h"
/* We support only two types - 9B and 16B for now */
@@ -86,7 +87,7 @@
rcu_read_lock();
qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
- swqe->ud_wr.remote_qpn);
+ rvt_get_swqe_remote_qpn(swqe));
if (!qp) {
ibp->rvp.n_pkt_drops++;
rcu_read_unlock();
@@ -104,7 +105,7 @@
goto drop;
}
- ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr;
+ ah_attr = rvt_get_swqe_ah_attr(swqe);
ppd = ppd_from_ibp(ibp);
if (qp->ibqp.qp_num > 1) {
@@ -134,8 +135,8 @@
if (qp->ibqp.qp_num) {
u32 qkey;
- qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
- sqp->qkey : swqe->ud_wr.remote_qkey;
+ qkey = (int)rvt_get_swqe_remote_qkey(swqe) < 0 ?
+ sqp->qkey : rvt_get_swqe_remote_qkey(swqe);
if (unlikely(qkey != qp->qkey))
goto drop; /* silently drop per IBTA spec */
}
@@ -210,8 +211,8 @@
}
hfi1_make_grh(ibp, &grh, &grd, 0, 0);
- hfi1_copy_sge(&qp->r_sge, &grh,
- sizeof(grh), true, false);
+ rvt_copy_sge(qp, &qp->r_sge, &grh,
+ sizeof(grh), true, false);
wc.wc_flags |= IB_WC_GRH;
} else {
rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true);
@@ -221,31 +222,11 @@
ssge.num_sge = swqe->wr.num_sge;
sge = &ssge.sge;
while (length) {
- u32 len = sge->length;
+ u32 len = rvt_get_sge_length(sge, length);
- if (len > length)
- len = length;
- if (len > sge->sge_length)
- len = sge->sge_length;
WARN_ON_ONCE(len == 0);
- hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, true, false);
- sge->vaddr += len;
- sge->length -= len;
- sge->sge_length -= len;
- if (sge->sge_length == 0) {
- if (--ssge.num_sge)
- *sge = *ssge.sg_list++;
- } else if (sge->length == 0 && sge->mr->lkey) {
- if (++sge->n >= RVT_SEGSZ) {
- if (++sge->m >= sge->mr->mapsz)
- break;
- sge->n = 0;
- }
- sge->vaddr =
- sge->mr->map[sge->m]->segs[sge->n].vaddr;
- sge->length =
- sge->mr->map[sge->m]->segs[sge->n].length;
- }
+ rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false);
+ rvt_update_sge(&ssge, len, false);
length -= len;
}
rvt_put_ss(&qp->r_sge);
@@ -259,7 +240,7 @@
if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) {
if (sqp->ibqp.qp_type == IB_QPT_GSI ||
sqp->ibqp.qp_type == IB_QPT_SMI)
- wc.pkey_index = swqe->ud_wr.pkey_index;
+ wc.pkey_index = rvt_get_swqe_pkey_index(swqe);
else
wc.pkey_index = sqp->s_pkey_index;
} else {
@@ -274,8 +255,7 @@
wc.dlid_path_bits = rdma_ah_get_dlid(ah_attr) & ((1 << ppd->lmc) - 1);
wc.port_num = qp->port_num;
/* Signal completion event if the solicited bit is set. */
- rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
- swqe->wr.send_flags & IB_SEND_SOLICITED);
+ rvt_recv_cq(qp, &wc, swqe->wr.send_flags & IB_SEND_SOLICITED);
ibp->rvp.n_loop_pkts++;
bail_unlock:
spin_unlock_irqrestore(&qp->r_lock, flags);
@@ -302,20 +282,21 @@
bth0 |= IB_BTH_SOLICITED;
bth0 |= extra_bytes << 20;
if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI)
- *pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index);
+ *pkey = hfi1_get_pkey(ibp, rvt_get_swqe_pkey_index(wqe));
else
*pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);
if (!bypass)
bth0 |= *pkey;
ohdr->bth[0] = cpu_to_be32(bth0);
- ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn);
+ ohdr->bth[1] = cpu_to_be32(rvt_get_swqe_remote_qpn(wqe));
ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn));
/*
* Qkeys with the high order bit set mean use the
* qkey from the QP context instead of the WR (see 10.2.5).
*/
- ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ?
- qp->qkey : wqe->ud_wr.remote_qkey);
+ ohdr->u.ud.deth[0] =
+ cpu_to_be32((int)rvt_get_swqe_remote_qkey(wqe) < 0 ? qp->qkey :
+ rvt_get_swqe_remote_qkey(wqe));
ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
}
@@ -335,7 +316,7 @@
ibp = to_iport(qp->ibqp.device, qp->port_num);
ppd = ppd_from_ibp(ibp);
- ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+ ah_attr = rvt_get_swqe_ah_attr(wqe);
extra_bytes = -wqe->length & 3;
nwords = ((wqe->length + extra_bytes) >> 2) + SIZE_OF_CRC;
@@ -399,7 +380,7 @@
struct hfi1_pportdata *ppd;
struct hfi1_ibport *ibp;
u32 dlid, slid, nwords, extra_bytes;
- u32 dest_qp = wqe->ud_wr.remote_qpn;
+ u32 dest_qp = rvt_get_swqe_remote_qpn(wqe);
u32 src_qp = qp->ibqp.qp_num;
u16 len, pkey;
u8 l4, sc5;
@@ -407,7 +388,7 @@
ibp = to_iport(qp->ibqp.device, qp->port_num);
ppd = ppd_from_ibp(ibp);
- ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+ ah_attr = rvt_get_swqe_ah_attr(wqe);
/*
* Build 16B Management Packet if either the destination
@@ -469,7 +450,7 @@
if (is_mgmt) {
l4 = OPA_16B_L4_FM;
- pkey = hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index);
+ pkey = hfi1_get_pkey(ibp, rvt_get_swqe_pkey_index(wqe));
hfi1_16B_set_qpn(&ps->s_txreq->phdr.hdr.opah.u.mgmt,
dest_qp, src_qp);
} else {
@@ -518,7 +499,7 @@
goto bail;
}
wqe = rvt_get_swqe_ptr(qp, qp->s_last);
- hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+ rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
goto done_free_tx;
}
@@ -534,7 +515,7 @@
/* Construct the header. */
ibp = to_iport(qp->ibqp.device, qp->port_num);
ppd = ppd_from_ibp(ibp);
- ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr;
+ ah_attr = rvt_get_swqe_ah_attr(wqe);
priv->hdr_type = hfi1_get_hdr_type(ppd->lid, ah_attr);
if ((!hfi1_check_mcast(rdma_ah_get_dlid(ah_attr))) ||
(rdma_ah_get_dlid(ah_attr) == be32_to_cpu(OPA_LID_PERMISSIVE))) {
@@ -560,7 +541,7 @@
ud_loopback(qp, wqe);
spin_lock_irqsave(&qp->s_lock, tflags);
ps->flags = tflags;
- hfi1_send_complete(qp, wqe, IB_WC_SUCCESS);
+ rvt_send_complete(qp, wqe, IB_WC_SUCCESS);
goto done_free_tx;
}
}
@@ -656,18 +637,19 @@
u32 bth0, plen, vl, hwords = 7;
u16 len;
u8 l4;
- struct hfi1_16b_header hdr;
+ struct hfi1_opa_header hdr;
struct ib_other_headers *ohdr;
struct pio_buf *pbuf;
struct send_context *ctxt = qp_to_send_context(qp, sc5);
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
u32 nwords;
+ hdr.hdr_type = HFI1_PKT_TYPE_16B;
/* Populate length */
nwords = ((hfi1_get_16b_padding(hwords << 2, 0) +
SIZE_OF_LT) >> 2) + SIZE_OF_CRC;
if (old_grh) {
- struct ib_grh *grh = &hdr.u.l.grh;
+ struct ib_grh *grh = &hdr.opah.u.l.grh;
grh->version_tclass_flow = old_grh->version_tclass_flow;
grh->paylen = cpu_to_be16(
@@ -675,11 +657,11 @@
grh->hop_limit = 0xff;
grh->sgid = old_grh->dgid;
grh->dgid = old_grh->sgid;
- ohdr = &hdr.u.l.oth;
+ ohdr = &hdr.opah.u.l.oth;
l4 = OPA_16B_L4_IB_GLOBAL;
hwords += sizeof(struct ib_grh) / sizeof(u32);
} else {
- ohdr = &hdr.u.oth;
+ ohdr = &hdr.opah.u.oth;
l4 = OPA_16B_L4_IB_LOCAL;
}
@@ -693,7 +675,7 @@
/* Convert dwords to flits */
len = (hwords + nwords) >> 1;
- hfi1_make_16b_hdr(&hdr, slid, dlid, len, pkey, 1, 0, l4, sc5);
+ hfi1_make_16b_hdr(&hdr.opah, slid, dlid, len, pkey, 1, 0, l4, sc5);
plen = 2 /* PBC */ + hwords + nwords;
pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC;
@@ -701,9 +683,11 @@
pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
if (ctxt) {
pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
- if (pbuf)
+ if (!IS_ERR_OR_NULL(pbuf)) {
+ trace_pio_output_ibhdr(ppd->dd, &hdr, sc5);
ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
&hdr, hwords);
+ }
}
}
@@ -715,14 +699,15 @@
u32 bth0, plen, vl, hwords = 5;
u16 lrh0;
u8 sl = ibp->sc_to_sl[sc5];
- struct ib_header hdr;
+ struct hfi1_opa_header hdr;
struct ib_other_headers *ohdr;
struct pio_buf *pbuf;
struct send_context *ctxt = qp_to_send_context(qp, sc5);
struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
+ hdr.hdr_type = HFI1_PKT_TYPE_9B;
if (old_grh) {
- struct ib_grh *grh = &hdr.u.l.grh;
+ struct ib_grh *grh = &hdr.ibh.u.l.grh;
grh->version_tclass_flow = old_grh->version_tclass_flow;
grh->paylen = cpu_to_be16(
@@ -730,11 +715,11 @@
grh->hop_limit = 0xff;
grh->sgid = old_grh->dgid;
grh->dgid = old_grh->sgid;
- ohdr = &hdr.u.l.oth;
+ ohdr = &hdr.ibh.u.l.oth;
lrh0 = HFI1_LRH_GRH;
hwords += sizeof(struct ib_grh) / sizeof(u32);
} else {
- ohdr = &hdr.u.oth;
+ ohdr = &hdr.ibh.u.oth;
lrh0 = HFI1_LRH_BTH;
}
@@ -746,16 +731,18 @@
ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << IB_BECN_SHIFT));
ohdr->bth[2] = 0; /* PSN 0 */
- hfi1_make_ib_hdr(&hdr, lrh0, hwords + SIZE_OF_CRC, dlid, slid);
+ hfi1_make_ib_hdr(&hdr.ibh, lrh0, hwords + SIZE_OF_CRC, dlid, slid);
plen = 2 /* PBC */ + hwords;
pbc_flags |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
vl = sc_to_vlt(ppd->dd, sc5);
pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);
if (ctxt) {
pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL);
- if (pbuf)
+ if (!IS_ERR_OR_NULL(pbuf)) {
+ trace_pio_output_ibhdr(ppd->dd, &hdr, sc5);
ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
&hdr, hwords);
+ }
}
}
@@ -912,7 +899,7 @@
src_qp = hfi1_16B_get_src_qpn(packet->mgmt);
}
- process_ecn(qp, packet, (opcode != IB_OPCODE_CNP));
+ process_ecn(qp, packet);
/*
* Get the number of bytes the message was padded by
* and drop incomplete packets.
@@ -980,7 +967,6 @@
opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
wc.ex.imm_data = packet->ohdr->u.ud.imm_data;
wc.wc_flags = IB_WC_WITH_IMM;
- tlen -= sizeof(u32);
} else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
wc.ex.imm_data = 0;
wc.wc_flags = 0;
@@ -1019,8 +1005,8 @@
goto drop;
}
if (packet->grh) {
- hfi1_copy_sge(&qp->r_sge, packet->grh,
- sizeof(struct ib_grh), true, false);
+ rvt_copy_sge(qp, &qp->r_sge, packet->grh,
+ sizeof(struct ib_grh), true, false);
wc.wc_flags |= IB_WC_GRH;
} else if (packet->etype == RHF_RCV_TYPE_BYPASS) {
struct ib_grh grh;
@@ -1030,14 +1016,14 @@
* out when creating 16B, add back the GRH here.
*/
hfi1_make_ext_grh(packet, &grh, slid, dlid);
- hfi1_copy_sge(&qp->r_sge, &grh,
- sizeof(struct ib_grh), true, false);
+ rvt_copy_sge(qp, &qp->r_sge, &grh,
+ sizeof(struct ib_grh), true, false);
wc.wc_flags |= IB_WC_GRH;
} else {
rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true);
}
- hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh),
- true, false);
+ rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh),
+ true, false);
rvt_put_ss(&qp->r_sge);
if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
return;
@@ -1075,7 +1061,7 @@
dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
wc.port_num = qp->port_num;
/* Signal completion event if the solicited bit is set. */
- rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, solicited);
+ rvt_recv_cq(qp, &wc, solicited);
return;
drop:
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index dbe7d14..3592a9e 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -232,7 +232,7 @@
}
/* Verify that access is OK for the user buffer */
- if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
+ if (!access_ok((void __user *)vaddr,
npages * PAGE_SIZE)) {
dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
(void *)vaddr, npages);
@@ -324,6 +324,9 @@
u32 *tidlist = NULL;
struct tid_user_buf *tidbuf;
+ if (!PAGE_ALIGNED(tinfo->vaddr))
+ return -EINVAL;
+
tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
if (!tidbuf)
return -ENOMEM;
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
index e383cc0..43b105d 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
@@ -48,7 +48,6 @@
*/
#include "hfi.h"
-
#include "exp_rcv.h"
struct tid_pageset {
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index e341e6d..469acb9 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -91,9 +91,7 @@
/* Convert to number of pages */
size = DIV_ROUND_UP(size, PAGE_SIZE);
- down_read(&mm->mmap_sem);
- pinned = mm->pinned_vm;
- up_read(&mm->mmap_sem);
+ pinned = atomic64_read(&mm->pinned_vm);
/* First, check the absolute limit against all pinned pages. */
if (pinned + npages >= ulimit && !can_lock)
@@ -106,14 +104,13 @@
bool writable, struct page **pages)
{
int ret;
+ unsigned int gup_flags = FOLL_LONGTERM | (writable ? FOLL_WRITE : 0);
- ret = get_user_pages_fast(vaddr, npages, writable, pages);
+ ret = get_user_pages_fast(vaddr, npages, gup_flags, pages);
if (ret < 0)
return ret;
- down_write(&mm->mmap_sem);
- mm->pinned_vm += ret;
- up_write(&mm->mmap_sem);
+ atomic64_add(ret, &mm->pinned_vm);
return ret;
}
@@ -121,17 +118,9 @@
void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
size_t npages, bool dirty)
{
- size_t i;
-
- for (i = 0; i < npages; i++) {
- if (dirty)
- set_page_dirty_lock(p[i]);
- put_page(p[i]);
- }
+ put_user_pages_dirty_lock(p, npages, dirty);
if (mm) { /* during close after signal, mm can be NULL */
- down_write(&mm->mmap_sem);
- mm->pinned_vm -= npages;
- up_write(&mm->mmap_sem);
+ atomic64_sub(npages, &mm->pinned_vm);
}
}
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index 51831bf..fd754a1 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2015 - 2017 Intel Corporation.
+ * Copyright(c) 2015 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -76,8 +76,7 @@
static unsigned initial_pkt_count = 8;
-static int user_sdma_send_pkts(struct user_sdma_request *req,
- unsigned maxpkts);
+static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts);
static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status);
static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq);
static void user_sdma_free_request(struct user_sdma_request *req, bool unpin);
@@ -101,7 +100,7 @@
static int defer_packet_queue(
struct sdma_engine *sde,
- struct iowait *wait,
+ struct iowait_work *wait,
struct sdma_txreq *txreq,
uint seq,
bool pkts_sent);
@@ -124,33 +123,31 @@
static int defer_packet_queue(
struct sdma_engine *sde,
- struct iowait *wait,
+ struct iowait_work *wait,
struct sdma_txreq *txreq,
uint seq,
bool pkts_sent)
{
struct hfi1_user_sdma_pkt_q *pq =
- container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
- struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
- struct user_sdma_txreq *tx =
- container_of(txreq, struct user_sdma_txreq, txreq);
+ container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy);
- if (sdma_progress(sde, seq, txreq)) {
- if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
- goto eagain;
- }
+ write_seqlock(&sde->waitlock);
+ if (sdma_progress(sde, seq, txreq))
+ goto eagain;
/*
* We are assuming that if the list is enqueued somewhere, it
* is to the dmawait list since that is the only place where
* it is supposed to be enqueued.
*/
xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
- write_seqlock(&dev->iowait_lock);
- if (list_empty(&pq->busy.list))
+ if (list_empty(&pq->busy.list)) {
+ iowait_get_priority(&pq->busy);
iowait_queue(pkts_sent, &pq->busy, &sde->dmawait);
- write_sequnlock(&dev->iowait_lock);
+ }
+ write_sequnlock(&sde->waitlock);
return -EBUSY;
eagain:
+ write_sequnlock(&sde->waitlock);
return -EAGAIN;
}
@@ -192,8 +189,8 @@
atomic_set(&pq->n_locked, 0);
pq->mm = fd->mm;
- iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
- activate_packet_queue, NULL);
+ iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
+ activate_packet_queue, NULL, NULL);
pq->reqidx = 0;
pq->reqs = kcalloc(hfi1_sdma_comp_ring_size,
@@ -756,9 +753,10 @@
return ret;
}
-static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
+static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts)
{
- int ret = 0, count;
+ int ret = 0;
+ u16 count;
unsigned npkts = 0;
struct user_sdma_txreq *tx = NULL;
struct hfi1_user_sdma_pkt_q *pq = NULL;
@@ -803,7 +801,6 @@
tx->flags = 0;
tx->req = req;
- tx->busycount = 0;
INIT_LIST_HEAD(&tx->list);
/*
@@ -860,8 +857,10 @@
changes = set_txreq_header_ahg(req, tx,
datalen);
- if (changes < 0)
+ if (changes < 0) {
+ ret = changes;
goto free_tx;
+ }
}
} else {
ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
@@ -910,7 +909,9 @@
npkts++;
}
dosend:
- ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count);
+ ret = sdma_send_txlist(req->sde,
+ iowait_get_ib_work(&pq->busy),
+ &req->txps, &count);
req->seqsubmitted += count;
if (req->seqsubmitted == req->info.npkts) {
/*
@@ -1123,7 +1124,8 @@
0xffffffull),
psn = val & mask;
if (expct)
- psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
+ psn = (psn & ~HFI1_KDETH_BTH_SEQ_MASK) |
+ ((psn + frags) & HFI1_KDETH_BTH_SEQ_MASK);
else
psn = psn + frags;
return psn & mask;
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h
index 91c343f..9972e0e 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.h
+++ b/drivers/infiniband/hw/hfi1/user_sdma.h
@@ -110,12 +110,6 @@
SDMA_PKT_Q_DEFERRED,
};
-/*
- * Maximum retry attempts to submit a TX request
- * before putting the process to sleep.
- */
-#define MAX_DEFER_RETRY_COUNT 1
-
#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
#define SDMA_DBG(req, fmt, ...) \
@@ -204,12 +198,12 @@
s8 ahg_idx;
/* Writeable fields shared with interrupt */
- u64 seqcomp ____cacheline_aligned_in_smp;
- u64 seqsubmitted;
+ u16 seqcomp ____cacheline_aligned_in_smp;
+ u16 seqsubmitted;
/* Send side fields */
struct list_head txps ____cacheline_aligned_in_smp;
- u64 seqnum;
+ u16 seqnum;
/*
* KDETH.OFFSET (TID) field
* The offset can cover multiple packets, depending on the
@@ -245,8 +239,7 @@
struct list_head list;
struct user_sdma_request *req;
u16 flags;
- unsigned int busycount;
- u64 seqnum;
+ u16 seqnum;
};
int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt,
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 3dfb4cf..089e201 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -54,6 +54,7 @@
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <rdma/opa_addr.h>
+#include <linux/nospec.h>
#include "hfi.h"
#include "common.h"
@@ -129,8 +130,6 @@
module_param(piothreshold, ushort, S_IRUGO);
MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio");
-#define COPY_CACHELESS 1
-#define COPY_ADAPTIVE 2
static unsigned int sge_copy_mode;
module_param(sge_copy_mode, uint, S_IRUGO);
MODULE_PARM_DESC(sge_copy_mode,
@@ -148,171 +147,24 @@
/* Length of buffer to create verbs txreq cache name */
#define TXREQ_NAME_LEN 24
-/* 16B trailing buffer */
-static const u8 trail_buf[MAX_16B_PADDING];
-
-static uint wss_threshold;
+static uint wss_threshold = 80;
module_param(wss_threshold, uint, S_IRUGO);
MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy");
static uint wss_clean_period = 256;
module_param(wss_clean_period, uint, S_IRUGO);
MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned");
-/* memory working set size */
-struct hfi1_wss {
- unsigned long *entries;
- atomic_t total_count;
- atomic_t clean_counter;
- atomic_t clean_entry;
-
- int threshold;
- int num_entries;
- long pages_mask;
-};
-
-static struct hfi1_wss wss;
-
-int hfi1_wss_init(void)
-{
- long llc_size;
- long llc_bits;
- long table_size;
- long table_bits;
-
- /* check for a valid percent range - default to 80 if none or invalid */
- if (wss_threshold < 1 || wss_threshold > 100)
- wss_threshold = 80;
- /* reject a wildly large period */
- if (wss_clean_period > 1000000)
- wss_clean_period = 256;
- /* reject a zero period */
- if (wss_clean_period == 0)
- wss_clean_period = 1;
-
- /*
- * Calculate the table size - the next power of 2 larger than the
- * LLC size. LLC size is in KiB.
- */
- llc_size = wss_llc_size() * 1024;
- table_size = roundup_pow_of_two(llc_size);
-
- /* one bit per page in rounded up table */
- llc_bits = llc_size / PAGE_SIZE;
- table_bits = table_size / PAGE_SIZE;
- wss.pages_mask = table_bits - 1;
- wss.num_entries = table_bits / BITS_PER_LONG;
-
- wss.threshold = (llc_bits * wss_threshold) / 100;
- if (wss.threshold == 0)
- wss.threshold = 1;
-
- atomic_set(&wss.clean_counter, wss_clean_period);
-
- wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries),
- GFP_KERNEL);
- if (!wss.entries) {
- hfi1_wss_exit();
- return -ENOMEM;
- }
-
- return 0;
-}
-
-void hfi1_wss_exit(void)
-{
- /* coded to handle partially initialized and repeat callers */
- kfree(wss.entries);
- wss.entries = NULL;
-}
-
-/*
- * Advance the clean counter. When the clean period has expired,
- * clean an entry.
- *
- * This is implemented in atomics to avoid locking. Because multiple
- * variables are involved, it can be racy which can lead to slightly
- * inaccurate information. Since this is only a heuristic, this is
- * OK. Any innaccuracies will clean themselves out as the counter
- * advances. That said, it is unlikely the entry clean operation will
- * race - the next possible racer will not start until the next clean
- * period.
- *
- * The clean counter is implemented as a decrement to zero. When zero
- * is reached an entry is cleaned.
- */
-static void wss_advance_clean_counter(void)
-{
- int entry;
- int weight;
- unsigned long bits;
-
- /* become the cleaner if we decrement the counter to zero */
- if (atomic_dec_and_test(&wss.clean_counter)) {
- /*
- * Set, not add, the clean period. This avoids an issue
- * where the counter could decrement below the clean period.
- * Doing a set can result in lost decrements, slowing the
- * clean advance. Since this a heuristic, this possible
- * slowdown is OK.
- *
- * An alternative is to loop, advancing the counter by a
- * clean period until the result is > 0. However, this could
- * lead to several threads keeping another in the clean loop.
- * This could be mitigated by limiting the number of times
- * we stay in the loop.
- */
- atomic_set(&wss.clean_counter, wss_clean_period);
-
- /*
- * Uniquely grab the entry to clean and move to next.
- * The current entry is always the lower bits of
- * wss.clean_entry. The table size, wss.num_entries,
- * is always a power-of-2.
- */
- entry = (atomic_inc_return(&wss.clean_entry) - 1)
- & (wss.num_entries - 1);
-
- /* clear the entry and count the bits */
- bits = xchg(&wss.entries[entry], 0);
- weight = hweight64((u64)bits);
- /* only adjust the contended total count if needed */
- if (weight)
- atomic_sub(weight, &wss.total_count);
- }
-}
-
-/*
- * Insert the given address into the working set array.
- */
-static void wss_insert(void *address)
-{
- u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask;
- u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */
- u32 nr = page & (BITS_PER_LONG - 1);
-
- if (!test_and_set_bit(nr, &wss.entries[entry]))
- atomic_inc(&wss.total_count);
-
- wss_advance_clean_counter();
-}
-
-/*
- * Is the working set larger than the threshold?
- */
-static inline bool wss_exceeds_threshold(void)
-{
- return atomic_read(&wss.total_count) >= wss.threshold;
-}
-
/*
* Translate ib_wr_opcode into ib_wc_opcode.
*/
const enum ib_wc_opcode ib_hfi1_wc_opcode[] = {
[IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+ [IB_WR_TID_RDMA_WRITE] = IB_WC_RDMA_WRITE,
[IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
[IB_WR_SEND] = IB_WC_SEND,
[IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
[IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+ [IB_WR_TID_RDMA_READ] = IB_WC_RDMA_READ,
[IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
[IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD,
[IB_WR_SEND_WITH_INV] = IB_WC_SEND,
@@ -348,6 +200,14 @@
[IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28,
[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = 12 + 8 + 4,
[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4,
+ [IB_OPCODE_TID_RDMA_READ_REQ] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_READ_RESP] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_REQ] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_RESP] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_ACK] = 12 + 8 + 36,
+ [IB_OPCODE_TID_RDMA_RESYNC] = 12 + 8 + 36,
/* UC */
[IB_OPCODE_UC_SEND_FIRST] = 12 + 8,
[IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8,
@@ -391,6 +251,17 @@
[IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv,
[IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = &hfi1_rc_rcv,
[IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = &hfi1_rc_rcv,
+
+ /* TID RDMA has separate handlers for different opcodes.*/
+ [IB_OPCODE_TID_RDMA_WRITE_REQ] = &hfi1_rc_rcv_tid_rdma_write_req,
+ [IB_OPCODE_TID_RDMA_WRITE_RESP] = &hfi1_rc_rcv_tid_rdma_write_resp,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA] = &hfi1_rc_rcv_tid_rdma_write_data,
+ [IB_OPCODE_TID_RDMA_WRITE_DATA_LAST] = &hfi1_rc_rcv_tid_rdma_write_data,
+ [IB_OPCODE_TID_RDMA_READ_REQ] = &hfi1_rc_rcv_tid_rdma_read_req,
+ [IB_OPCODE_TID_RDMA_READ_RESP] = &hfi1_rc_rcv_tid_rdma_read_resp,
+ [IB_OPCODE_TID_RDMA_RESYNC] = &hfi1_rc_rcv_tid_rdma_resync,
+ [IB_OPCODE_TID_RDMA_ACK] = &hfi1_rc_rcv_tid_rdma_ack,
+
/* UC */
[IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv,
[IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv,
@@ -438,79 +309,6 @@
*/
__be64 ib_hfi1_sys_image_guid;
-/**
- * hfi1_copy_sge - copy data to SGE memory
- * @ss: the SGE state
- * @data: the data to copy
- * @length: the length of the data
- * @release: boolean to release MR
- * @copy_last: do a separate copy of the last 8 bytes
- */
-void hfi1_copy_sge(
- struct rvt_sge_state *ss,
- void *data, u32 length,
- bool release,
- bool copy_last)
-{
- struct rvt_sge *sge = &ss->sge;
- int i;
- bool in_last = false;
- bool cacheless_copy = false;
-
- if (sge_copy_mode == COPY_CACHELESS) {
- cacheless_copy = length >= PAGE_SIZE;
- } else if (sge_copy_mode == COPY_ADAPTIVE) {
- if (length >= PAGE_SIZE) {
- /*
- * NOTE: this *assumes*:
- * o The first vaddr is the dest.
- * o If multiple pages, then vaddr is sequential.
- */
- wss_insert(sge->vaddr);
- if (length >= (2 * PAGE_SIZE))
- wss_insert(sge->vaddr + PAGE_SIZE);
-
- cacheless_copy = wss_exceeds_threshold();
- } else {
- wss_advance_clean_counter();
- }
- }
- if (copy_last) {
- if (length > 8) {
- length -= 8;
- } else {
- copy_last = false;
- in_last = true;
- }
- }
-
-again:
- while (length) {
- u32 len = rvt_get_sge_length(sge, length);
-
- WARN_ON_ONCE(len == 0);
- if (unlikely(in_last)) {
- /* enforce byte transfer ordering */
- for (i = 0; i < len; i++)
- ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i];
- } else if (cacheless_copy) {
- cacheless_memcpy(sge->vaddr, data, len);
- } else {
- memcpy(sge->vaddr, data, len);
- }
- rvt_update_sge(ss, len, release);
- data += len;
- length -= len;
- }
-
- if (copy_last) {
- copy_last = false;
- in_last = true;
- length = 8;
- goto again;
- }
-}
-
/*
* Make sure the QP is ready and able to accept the given opcode.
*/
@@ -529,7 +327,7 @@
static u64 hfi1_fault_tx(struct rvt_qp *qp, u8 opcode, u64 pbc)
{
#ifdef CONFIG_FAULT_INJECTION
- if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP)
+ if ((opcode & IB_OPCODE_MSP) == IB_OPCODE_MSP) {
/*
* In order to drop non-IB traffic we
* set PbcInsertHrc to NONE (0x2).
@@ -540,8 +338,9 @@
* packet will not be delivered to the
* correct context.
*/
+ pbc &= ~PBC_INSERT_HCRC_SMASK;
pbc |= (u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT;
- else
+ } else {
/*
* In order to drop regular verbs
* traffic we set the PbcTestEbp
@@ -551,10 +350,129 @@
* triggered and will be dropped.
*/
pbc |= PBC_TEST_EBP;
+ }
#endif
return pbc;
}
+static opcode_handler tid_qp_ok(int opcode, struct hfi1_packet *packet)
+{
+ if (packet->qp->ibqp.qp_type != IB_QPT_RC ||
+ !(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK))
+ return NULL;
+ if ((opcode & RVT_OPCODE_QP_MASK) == IB_OPCODE_TID_RDMA)
+ return opcode_handler_tbl[opcode];
+ return NULL;
+}
+
+void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct ib_header *hdr = packet->hdr;
+ u32 tlen = packet->tlen;
+ struct hfi1_pportdata *ppd = rcd->ppd;
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+ opcode_handler opcode_handler;
+ unsigned long flags;
+ u32 qp_num;
+ int lnh;
+ u8 opcode;
+
+ /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
+ if (unlikely(tlen < 15 * sizeof(u32)))
+ goto drop;
+
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh != HFI1_LRH_BTH)
+ goto drop;
+
+ packet->ohdr = &hdr->u.oth;
+ trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+ opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+ inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+ /* verbs_qp can be picked up from any tid_rdma header struct */
+ qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_req.verbs_qp) &
+ RVT_QPN_MASK;
+
+ rcu_read_lock();
+ packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+ if (!packet->qp)
+ goto drop_rcu;
+ spin_lock_irqsave(&packet->qp->r_lock, flags);
+ opcode_handler = tid_qp_ok(opcode, packet);
+ if (likely(opcode_handler))
+ opcode_handler(packet);
+ else
+ goto drop_unlock;
+ spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+ rcu_read_unlock();
+
+ return;
+drop_unlock:
+ spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+drop_rcu:
+ rcu_read_unlock();
+drop:
+ ibp->rvp.n_pkt_drops++;
+}
+
+void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet)
+{
+ struct hfi1_ctxtdata *rcd = packet->rcd;
+ struct ib_header *hdr = packet->hdr;
+ u32 tlen = packet->tlen;
+ struct hfi1_pportdata *ppd = rcd->ppd;
+ struct hfi1_ibport *ibp = &ppd->ibport_data;
+ struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi;
+ opcode_handler opcode_handler;
+ unsigned long flags;
+ u32 qp_num;
+ int lnh;
+ u8 opcode;
+
+ /* DW == LRH (2) + BTH (3) + KDETH (9) + CRC (1) */
+ if (unlikely(tlen < 15 * sizeof(u32)))
+ goto drop;
+
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh != HFI1_LRH_BTH)
+ goto drop;
+
+ packet->ohdr = &hdr->u.oth;
+ trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+
+ opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24);
+ inc_opstats(tlen, &rcd->opstats->stats[opcode]);
+
+ /* verbs_qp can be picked up from any tid_rdma header struct */
+ qp_num = be32_to_cpu(packet->ohdr->u.tid_rdma.r_rsp.verbs_qp) &
+ RVT_QPN_MASK;
+
+ rcu_read_lock();
+ packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num);
+ if (!packet->qp)
+ goto drop_rcu;
+ spin_lock_irqsave(&packet->qp->r_lock, flags);
+ opcode_handler = tid_qp_ok(opcode, packet);
+ if (likely(opcode_handler))
+ opcode_handler(packet);
+ else
+ goto drop_unlock;
+ spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+ rcu_read_unlock();
+
+ return;
+drop_unlock:
+ spin_unlock_irqrestore(&packet->qp->r_lock, flags);
+drop_rcu:
+ rcu_read_unlock();
+drop:
+ ibp->rvp.n_pkt_drops++;
+}
+
static int hfi1_do_pkey_check(struct hfi1_packet *packet)
{
struct hfi1_ctxtdata *rcd = packet->rcd;
@@ -713,11 +631,13 @@
spin_lock(&qp->s_lock);
if (tx->wqe) {
- hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
+ rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
} else if (qp->ibqp.qp_type == IB_QPT_RC) {
struct hfi1_opa_header *hdr;
hdr = &tx->phdr.hdr;
+ if (unlikely(status == SDMA_TXREQ_S_ABORTED))
+ hfi1_rc_verbs_aborted(qp, hdr);
hfi1_rc_send_complete(qp, hdr);
}
spin_unlock(&qp->s_lock);
@@ -725,11 +645,28 @@
hfi1_put_txreq(tx);
}
+void hfi1_wait_kmem(struct rvt_qp *qp)
+{
+ struct hfi1_qp_priv *priv = qp->priv;
+ struct ib_qp *ibqp = &qp->ibqp;
+ struct ib_device *ibdev = ibqp->device;
+ struct hfi1_ibdev *dev = to_idev(ibdev);
+
+ if (list_empty(&priv->s_iowait.list)) {
+ if (list_empty(&dev->memwait))
+ mod_timer(&dev->mem_timer, jiffies + 1);
+ qp->s_flags |= RVT_S_WAIT_KMEM;
+ list_add_tail(&priv->s_iowait.list, &dev->memwait);
+ priv->s_iowait.lock = &dev->iowait_lock;
+ trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
+ rvt_get_qp(qp);
+ }
+}
+
static int wait_kmem(struct hfi1_ibdev *dev,
struct rvt_qp *qp,
struct hfi1_pkt_state *ps)
{
- struct hfi1_qp_priv *priv = qp->priv;
unsigned long flags;
int ret = 0;
@@ -737,18 +674,10 @@
if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
write_seqlock(&dev->iowait_lock);
list_add_tail(&ps->s_txreq->txreq.list,
- &priv->s_iowait.tx_head);
- if (list_empty(&priv->s_iowait.list)) {
- if (list_empty(&dev->memwait))
- mod_timer(&dev->mem_timer, jiffies + 1);
- qp->s_flags |= RVT_S_WAIT_KMEM;
- list_add_tail(&priv->s_iowait.list, &dev->memwait);
- priv->s_iowait.lock = &dev->iowait_lock;
- trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM);
- rvt_get_qp(qp);
- }
+ &ps->wait->tx_head);
+ hfi1_wait_kmem(qp);
write_sequnlock(&dev->iowait_lock);
- qp->s_flags &= ~RVT_S_BUSY;
+ hfi1_qp_unbusy(qp, ps->wait);
ret = -EBUSY;
}
spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -774,11 +703,7 @@
int ret = 0;
while (length) {
- len = ss->sge.length;
- if (len > length)
- len = length;
- if (len > ss->sge.sge_length)
- len = ss->sge.sge_length;
+ len = rvt_get_sge_length(&ss->sge, length);
WARN_ON_ONCE(len == 0);
ret = sdma_txadd_kvaddr(
sde->dd,
@@ -892,13 +817,22 @@
/* add icrc, lt byte, and padding to flit */
if (extra_bytes)
- ret = sdma_txadd_kvaddr(sde->dd, &tx->txreq,
- (void *)trail_buf, extra_bytes);
+ ret = sdma_txadd_daddr(sde->dd, &tx->txreq,
+ sde->dd->sdma_pad_phys, extra_bytes);
bail_txadd:
return ret;
}
+static u64 update_hcrc(u8 opcode, u64 pbc)
+{
+ if ((opcode & IB_OPCODE_TID_RDMA) == IB_OPCODE_TID_RDMA) {
+ pbc &= ~PBC_INSERT_HCRC_SMASK;
+ pbc |= (u64)PBC_IHCRC_LKDETH << PBC_INSERT_HCRC_SHIFT;
+ }
+ return pbc;
+}
+
int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
u64 pbc)
{
@@ -937,21 +871,24 @@
else
pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
- if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
- pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
pbc = create_pbc(ppd,
pbc,
qp->srate_mbps,
vl,
plen);
+
+ if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
+ pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
+ else
+ /* Update HCRC based on packet opcode */
+ pbc = update_hcrc(ps->opcode, pbc);
}
tx->wqe = qp->s_wqe;
ret = build_verbs_tx_desc(tx->sde, len, tx, ahg_info, pbc);
if (unlikely(ret))
goto bail_build;
}
- ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq,
- ps->pkts_sent);
+ ret = sdma_send_txreq(tx->sde, ps->wait, &tx->txreq, ps->pkts_sent);
if (unlikely(ret < 0)) {
if (ret == -ECOMM)
goto bail_ecomm;
@@ -987,7 +924,6 @@
{
struct hfi1_qp_priv *priv = qp->priv;
struct hfi1_devdata *dd = sc->dd;
- struct hfi1_ibdev *dev = &dd->verbs_dev;
unsigned long flags;
int ret = 0;
@@ -999,9 +935,9 @@
*/
spin_lock_irqsave(&qp->s_lock, flags);
if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
- write_seqlock(&dev->iowait_lock);
+ write_seqlock(&sc->waitlock);
list_add_tail(&ps->s_txreq->txreq.list,
- &priv->s_iowait.tx_head);
+ &ps->wait->tx_head);
if (list_empty(&priv->s_iowait.list)) {
struct hfi1_ibdev *dev = &dd->verbs_dev;
int was_empty;
@@ -1010,17 +946,18 @@
dev->n_piodrain += !!(flag & HFI1_S_WAIT_PIO_DRAIN);
qp->s_flags |= flag;
was_empty = list_empty(&sc->piowait);
+ iowait_get_priority(&priv->s_iowait);
iowait_queue(ps->pkts_sent, &priv->s_iowait,
&sc->piowait);
- priv->s_iowait.lock = &dev->iowait_lock;
+ priv->s_iowait.lock = &sc->waitlock;
trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO);
rvt_get_qp(qp);
/* counting: only call wantpiobuf_intr if first user */
if (was_empty)
hfi1_sc_wantpiobuf_intr(sc, 1);
}
- write_sequnlock(&dev->iowait_lock);
- qp->s_flags &= ~RVT_S_BUSY;
+ write_sequnlock(&sc->waitlock);
+ hfi1_qp_unbusy(qp, ps->wait);
ret = -EBUSY;
}
spin_unlock_irqrestore(&qp->s_lock, flags);
@@ -1091,17 +1028,20 @@
else
pbc |= (ib_is_sc5(sc5) << PBC_DC_INFO_SHIFT);
+ pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
if (unlikely(hfi1_dbg_should_fault_tx(qp, ps->opcode)))
pbc = hfi1_fault_tx(qp, ps->opcode, pbc);
- pbc = create_pbc(ppd, pbc, qp->srate_mbps, vl, plen);
+ else
+ /* Update HCRC based on packet opcode */
+ pbc = update_hcrc(ps->opcode, pbc);
}
if (cb)
iowait_pio_inc(&priv->s_iowait);
pbuf = sc_buffer_alloc(sc, plen, cb, qp);
- if (unlikely(!pbuf)) {
+ if (IS_ERR_OR_NULL(pbuf)) {
if (cb)
verbs_pio_complete(qp, 0);
- if (ppd->host_link_state != HLS_UP_ACTIVE) {
+ if (IS_ERR(pbuf)) {
/*
* If we have filled the PIO buffers to capacity and are
* not in an active state this request is not going to
@@ -1137,10 +1077,8 @@
if (ss) {
while (len) {
void *addr = ss->sge.vaddr;
- u32 slen = ss->sge.length;
+ u32 slen = rvt_get_sge_length(&ss->sge, len);
- if (slen > len)
- slen = len;
rvt_update_sge(ss, slen, false);
seg_pio_copy_mid(pbuf, addr, slen);
len -= slen;
@@ -1148,7 +1086,8 @@
}
/* add icrc, lt byte, and padding to flit */
if (extra_bytes)
- seg_pio_copy_mid(pbuf, trail_buf, extra_bytes);
+ seg_pio_copy_mid(pbuf, ppd->dd->sdma_pad_dma,
+ extra_bytes);
seg_pio_copy_end(pbuf);
}
@@ -1158,15 +1097,15 @@
&ps->s_txreq->phdr.hdr, ib_is_sc5(sc5));
pio_bail:
+ spin_lock_irqsave(&qp->s_lock, flags);
if (qp->s_wqe) {
- spin_lock_irqsave(&qp->s_lock, flags);
- hfi1_send_complete(qp, qp->s_wqe, wc_status);
- spin_unlock_irqrestore(&qp->s_lock, flags);
+ rvt_send_complete(qp, qp->s_wqe, wc_status);
} else if (qp->ibqp.qp_type == IB_QPT_RC) {
- spin_lock_irqsave(&qp->s_lock, flags);
+ if (unlikely(wc_status == IB_WC_GENERAL_ERR))
+ hfi1_rc_verbs_aborted(qp, &ps->s_txreq->phdr.hdr);
hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr);
- spin_unlock_irqrestore(&qp->s_lock, flags);
}
+ spin_unlock_irqrestore(&qp->s_lock, flags);
ret = 0;
@@ -1286,15 +1225,16 @@
case IB_QPT_UD:
break;
case IB_QPT_UC:
- case IB_QPT_RC: {
+ case IB_QPT_RC:
+ priv->s_running_pkt_size =
+ (tx->s_cur_size + priv->s_running_pkt_size) / 2;
if (piothreshold &&
- tx->s_cur_size <= min(piothreshold, qp->pmtu) &&
+ priv->s_running_pkt_size <= min(piothreshold, qp->pmtu) &&
(BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]) &&
iowait_sdma_pending(&priv->s_iowait) == 0 &&
!sdma_txreq_built(&tx->txreq))
return dd->process_pio_send;
break;
- }
default:
break;
}
@@ -1367,7 +1307,7 @@
hfi1_cdbg(PIO, "%s() Failed. Completing with err",
__func__);
spin_lock_irqsave(&qp->s_lock, flags);
- hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
+ rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
spin_unlock_irqrestore(&qp->s_lock, flags);
}
return -EINVAL;
@@ -1409,15 +1349,15 @@
rdi->dparms.props.max_mr_size = U64_MAX;
rdi->dparms.props.max_fast_reg_page_list_len = UINT_MAX;
rdi->dparms.props.max_qp = hfi1_max_qps;
- rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs;
+ rdi->dparms.props.max_qp_wr =
+ (hfi1_max_qp_wrs >= HFI1_QP_WQE_INVALID ?
+ HFI1_QP_WQE_INVALID - 1 : hfi1_max_qp_wrs);
rdi->dparms.props.max_send_sge = hfi1_max_sges;
rdi->dparms.props.max_recv_sge = hfi1_max_sges;
rdi->dparms.props.max_sge_rd = hfi1_max_sges;
rdi->dparms.props.max_cq = hfi1_max_cqs;
rdi->dparms.props.max_ah = hfi1_max_ahs;
rdi->dparms.props.max_cqe = hfi1_max_cqes;
- rdi->dparms.props.max_mr = rdi->lkey_table.max;
- rdi->dparms.props.max_fmr = rdi->lkey_table.max;
rdi->dparms.props.max_map_per_fmr = 32767;
rdi->dparms.props.max_pd = hfi1_max_pds;
rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
@@ -1596,6 +1536,7 @@
sl = rdma_ah_get_sl(ah_attr);
if (sl >= ARRAY_SIZE(ibp->sl_to_sc))
return -EINVAL;
+ sl = array_index_nospec(sl, ARRAY_SIZE(ibp->sl_to_sc));
sc5 = ibp->sl_to_sc[sl];
if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf)
@@ -1800,15 +1741,15 @@
static u64 hfi1_sps_ints(void)
{
- unsigned long flags;
+ unsigned long index, flags;
struct hfi1_devdata *dd;
u64 sps_ints = 0;
- spin_lock_irqsave(&hfi1_devs_lock, flags);
- list_for_each_entry(dd, &hfi1_dev_list, list) {
+ xa_lock_irqsave(&hfi1_dev_table, flags);
+ xa_for_each(&hfi1_dev_table, index, dd) {
sps_ints += get_all_cpu_total(dd->int_counter);
}
- spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+ xa_unlock_irqrestore(&hfi1_dev_table, flags);
return sps_ints;
}
@@ -1838,6 +1779,20 @@
return count;
}
+static const struct ib_device_ops hfi1_dev_ops = {
+ .owner = THIS_MODULE,
+ .driver_id = RDMA_DRIVER_HFI1,
+
+ .alloc_hw_stats = alloc_hw_stats,
+ .alloc_rdma_netdev = hfi1_vnic_alloc_rn,
+ .get_dev_fw_str = hfi1_get_dev_fw_str,
+ .get_hw_stats = get_hw_stats,
+ .init_port = hfi1_create_port_files,
+ .modify_device = modify_device,
+ /* keep process mad in the driver */
+ .process_mad = hfi1_process_mad,
+};
+
/**
* hfi1_register_ib_device - register our device with the infiniband core
* @dd: the device data structure
@@ -1878,17 +1833,10 @@
*/
if (!ib_hfi1_sys_image_guid)
ib_hfi1_sys_image_guid = ibdev->node_guid;
- ibdev->owner = THIS_MODULE;
ibdev->phys_port_cnt = dd->num_pports;
ibdev->dev.parent = &dd->pcidev->dev;
- ibdev->modify_device = modify_device;
- ibdev->alloc_hw_stats = alloc_hw_stats;
- ibdev->get_hw_stats = get_hw_stats;
- ibdev->alloc_rdma_netdev = hfi1_vnic_alloc_rn;
- /* keep process mad in the driver */
- ibdev->process_mad = hfi1_process_mad;
- ibdev->get_dev_fw_str = hfi1_get_dev_fw_str;
+ ib_set_device_ops(ibdev, &hfi1_dev_ops);
strlcpy(ibdev->node_desc, init_utsname()->nodename,
sizeof(ibdev->node_desc));
@@ -1896,7 +1844,6 @@
/*
* Fill in rvt info object.
*/
- dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files;
dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev;
dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah;
dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah;
@@ -1926,6 +1873,7 @@
dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE;
dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc;
+ dd->verbs_dev.rdi.driver_f.qp_priv_init = hfi1_qp_priv_init;
dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free;
dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps;
dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset;
@@ -1943,7 +1891,7 @@
dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp;
dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp;
dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc;
- dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe;
+ dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe;
dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup =
hfi1_comp_vect_mappings_lookup;
@@ -1956,10 +1904,18 @@
dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size;
dd->verbs_dev.rdi.dparms.nports = dd->num_pports;
dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd);
+ dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode;
+ dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold;
+ dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period;
+ dd->verbs_dev.rdi.dparms.reserved_operations = 1;
+ dd->verbs_dev.rdi.dparms.extra_rdma_atomic = HFI1_TID_RDMA_WRITE_CNT;
/* post send table */
dd->verbs_dev.rdi.post_parms = hfi1_post_parms;
+ /* opcode translation table */
+ dd->verbs_dev.rdi.wc_opcode = ib_hfi1_wc_opcode;
+
ppd = dd->pport;
for (i = 0; i < dd->num_pports; i++, ppd++)
rvt_init_port(&dd->verbs_dev.rdi,
@@ -1967,7 +1923,10 @@
i,
ppd->pkeys);
- ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1);
+ rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev,
+ &ib_hfi1_attr_group);
+
+ ret = rvt_register_device(&dd->verbs_dev.rdi);
if (ret)
goto err_verbs_txreq;
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index a4d0650..ae9582d 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -71,6 +71,8 @@
struct hfi1_packet;
#include "iowait.h"
+#include "tid_rdma.h"
+#include "opfn.h"
#define HFI1_MAX_RDMA_ATOMIC 16
@@ -156,21 +158,83 @@
struct hfi1_ahg_info *s_ahg; /* ahg info for next header */
struct sdma_engine *s_sde; /* current sde */
struct send_context *s_sendcontext; /* current sendcontext */
+ struct hfi1_ctxtdata *rcd; /* QP's receive context */
+ struct page **pages; /* for TID page scan */
+ u32 tid_enqueue; /* saved when tid waited */
u8 s_sc; /* SC[0..4] for next packet */
struct iowait s_iowait;
+ struct timer_list s_tid_timer; /* for timing tid wait */
+ struct timer_list s_tid_retry_timer; /* for timing tid ack */
+ struct list_head tid_wait; /* for queueing tid space */
+ struct hfi1_opfn_data opfn;
+ struct tid_flow_state flow_state;
+ struct tid_rdma_qp_params tid_rdma;
struct rvt_qp *owner;
+ u16 s_running_pkt_size;
u8 hdr_type; /* 9B or 16B */
+ struct rvt_sge_state tid_ss; /* SGE state pointer for 2nd leg */
+ atomic_t n_requests; /* # of TID RDMA requests in the */
+ /* queue */
+ atomic_t n_tid_requests; /* # of sent TID RDMA requests */
+ unsigned long tid_timer_timeout_jiffies;
+ unsigned long tid_retry_timeout_jiffies;
+
+ /* variables for the TID RDMA SE state machine */
+ u8 s_state;
+ u8 s_retry;
+ u8 rnr_nak_state; /* RNR NAK state */
+ u8 s_nak_state;
+ u32 s_nak_psn;
+ u32 s_flags;
+ u32 s_tid_cur;
+ u32 s_tid_head;
+ u32 s_tid_tail;
+ u32 r_tid_head; /* Most recently added TID RDMA request */
+ u32 r_tid_tail; /* the last completed TID RDMA request */
+ u32 r_tid_ack; /* the TID RDMA request to be ACK'ed */
+ u32 r_tid_alloc; /* Request for which we are allocating resources */
+ u32 pending_tid_w_segs; /* Num of pending tid write segments */
+ u32 pending_tid_w_resp; /* Num of pending tid write responses */
+ u32 alloc_w_segs; /* Number of segments for which write */
+ /* resources have been allocated for this QP */
+
+ /* For TID RDMA READ */
+ u32 tid_r_reqs; /* Num of tid reads requested */
+ u32 tid_r_comp; /* Num of tid reads completed */
+ u32 pending_tid_r_segs; /* Num of pending tid read segments */
+ u16 pkts_ps; /* packets per segment */
+ u8 timeout_shift; /* account for number of packets per segment */
+
+ u32 r_next_psn_kdeth;
+ u32 r_next_psn_kdeth_save;
+ u32 s_resync_psn;
+ u8 sync_pt; /* Set when QP reaches sync point */
+ u8 resync;
+};
+
+#define HFI1_QP_WQE_INVALID ((u32)-1)
+
+struct hfi1_swqe_priv {
+ struct tid_rdma_request tid_req;
+ struct rvt_sge_state ss; /* Used for TID RDMA READ Request */
+};
+
+struct hfi1_ack_priv {
+ struct rvt_sge_state ss; /* used for TID WRITE RESP */
+ struct tid_rdma_request tid_req;
};
/*
* This structure is used to hold commonly lookedup and computed values during
* the send engine progress.
*/
+struct iowait_work;
struct hfi1_pkt_state {
struct hfi1_ibdev *dev;
struct hfi1_ibport *ibp;
struct hfi1_pportdata *ppd;
struct verbs_txreq *s_txreq;
+ struct iowait_work *wait;
unsigned long flags;
unsigned long timeout;
unsigned long timeout_int;
@@ -221,6 +285,7 @@
struct kmem_cache *verbs_txreq_cache;
u64 n_txwait;
u64 n_kmem_wait;
+ u64 n_tidwait;
/* protect iowait lists */
seqlock_t iowait_lock ____cacheline_aligned_in_smp;
@@ -247,7 +312,7 @@
return container_of(rdi, struct hfi1_ibdev, rdi);
}
-static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait)
+static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait)
{
struct hfi1_qp_priv *priv;
@@ -308,14 +373,36 @@
return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT;
}
+static inline struct tid_rdma_request *wqe_to_tid_req(struct rvt_swqe *wqe)
+{
+ return &((struct hfi1_swqe_priv *)wqe->priv)->tid_req;
+}
+
+static inline struct tid_rdma_request *ack_to_tid_req(struct rvt_ack_entry *e)
+{
+ return &((struct hfi1_ack_priv *)e->priv)->tid_req;
+}
+
+/*
+ * Look through all the active flows for a TID RDMA request and find
+ * the one (if it exists) that contains the specified PSN.
+ */
+static inline u32 __full_flow_psn(struct flow_state *state, u32 psn)
+{
+ return mask_psn((state->generation << HFI1_KDETH_BTH_SEQ_SHIFT) |
+ (psn & HFI1_KDETH_BTH_SEQ_MASK));
+}
+
+static inline u32 full_flow_psn(struct tid_rdma_flow *flow, u32 psn)
+{
+ return __full_flow_psn(&flow->flow_state, psn);
+}
+
struct verbs_txreq;
void hfi1_put_txreq(struct verbs_txreq *tx);
int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
-void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length,
- bool release, bool copy_last);
-
void hfi1_cnp_rcv(struct hfi1_packet *packet);
void hfi1_uc_rcv(struct hfi1_packet *packet);
@@ -329,6 +416,7 @@
u8 ah_to_sc(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr);
+void hfi1_rc_verbs_aborted(struct rvt_qp *qp, struct hfi1_opa_header *opah);
void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah);
void hfi1_ud_rcv(struct hfi1_packet *packet);
@@ -343,7 +431,8 @@
void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
int attr_mask, struct ib_udata *udata);
void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait);
-int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe);
+int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe,
+ bool *call_send);
extern const u32 rc_only_opcode;
extern const u32 uc_only_opcode;
@@ -354,18 +443,18 @@
const struct ib_global_route *grh, u32 hwords, u32 nwords);
void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
- u32 bth0, u32 bth2, int middle,
+ u32 bth0, u32 bth1, u32 bth2, int middle,
struct hfi1_pkt_state *ps);
+bool hfi1_schedule_send_yield(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
+ bool tid);
+
void _hfi1_do_send(struct work_struct *work);
void hfi1_do_send_from_rvt(struct rvt_qp *qp);
void hfi1_do_send(struct rvt_qp *qp, bool in_thread);
-void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
- enum ib_wc_status status);
-
void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn);
int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
@@ -378,6 +467,10 @@
void hfi1_unregister_ib_device(struct hfi1_devdata *);
+void hfi1_kdeth_eager_rcv(struct hfi1_packet *packet);
+
+void hfi1_kdeth_expected_rcv(struct hfi1_packet *packet);
+
void hfi1_ib_rcv(struct hfi1_packet *packet);
void hfi1_16B_rcv(struct hfi1_packet *packet);
@@ -390,33 +483,21 @@
int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps,
u64 pbc);
-int hfi1_wss_init(void);
-void hfi1_wss_exit(void);
-
-/* platform specific: return the lowest level cache (llc) size, in KiB */
-static inline int wss_llc_size(void)
-{
- /* assume that the boot CPU value is universal for all CPUs */
- return boot_cpu_data.x86_cache_size;
-}
-
-/* platform specific: cacheless copy */
-static inline void cacheless_memcpy(void *dst, void *src, size_t n)
-{
- /*
- * Use the only available X64 cacheless copy. Add a __user cast
- * to quiet sparse. The src agument is already in the kernel so
- * there are no security issues. The extra fault recovery machinery
- * is not invoked.
- */
- __copy_user_nocache(dst, (void __user *)src, n, 0);
-}
-
static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr)
{
return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ);
}
+void hfi1_wait_kmem(struct rvt_qp *qp);
+
+static inline void hfi1_trdma_send_complete(struct rvt_qp *qp,
+ struct rvt_swqe *wqe,
+ enum ib_wc_status status)
+{
+ trdma_clean_swqe(qp, wqe);
+ rvt_send_complete(qp, wqe, status);
+}
+
extern const enum ib_wc_opcode ib_hfi1_wc_opcode[];
extern const u8 hdr_len_by_opcode[];
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.c b/drivers/infiniband/hw/hfi1/verbs_txreq.c
index c4ab2d5..8f766dd 100644
--- a/drivers/infiniband/hw/hfi1/verbs_txreq.c
+++ b/drivers/infiniband/hw/hfi1/verbs_txreq.c
@@ -100,7 +100,7 @@
if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) {
struct hfi1_qp_priv *priv;
- tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+ tx = kmem_cache_alloc(dev->verbs_txreq_cache, VERBS_TXREQ_GFP);
if (tx)
goto out;
priv = qp->priv;
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h
index 1c19bbc..bfa6e08 100644
--- a/drivers/infiniband/hw/hfi1/verbs_txreq.h
+++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h
@@ -72,6 +72,7 @@
struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev,
struct rvt_qp *qp);
+#define VERBS_TXREQ_GFP (GFP_ATOMIC | __GFP_NOWARN)
static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev,
struct rvt_qp *qp)
__must_hold(&qp->slock)
@@ -79,7 +80,7 @@
struct verbs_txreq *tx;
struct hfi1_qp_priv *priv = qp->priv;
- tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC);
+ tx = kmem_cache_alloc(dev->verbs_txreq_cache, VERBS_TXREQ_GFP);
if (unlikely(!tx)) {
/* call slow path to get the lock */
tx = __get_txreq(dev, qp);
@@ -94,6 +95,7 @@
tx->txreq.num_desc = 0;
/* Set the header type */
tx->phdr.hdr.hdr_type = priv->hdr_type;
+ tx->txreq.flags = 0;
return tx;
}
@@ -102,22 +104,19 @@
return &tx->txreq;
}
-static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp)
+static inline struct verbs_txreq *get_waiting_verbs_txreq(struct iowait_work *w)
{
struct sdma_txreq *stx;
- struct hfi1_qp_priv *priv = qp->priv;
- stx = iowait_get_txhead(&priv->s_iowait);
+ stx = iowait_get_txhead(w);
if (stx)
return container_of(stx, struct verbs_txreq, txreq);
return NULL;
}
-static inline bool verbs_txreq_queued(struct rvt_qp *qp)
+static inline bool verbs_txreq_queued(struct iowait_work *w)
{
- struct hfi1_qp_priv *priv = qp->priv;
-
- return iowait_packet_queued(&priv->s_iowait);
+ return iowait_packet_queued(w);
}
void hfi1_put_txreq(struct verbs_txreq *tx);
diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c
index c643d80..b49e60e 100644
--- a/drivers/infiniband/hw/hfi1/vnic_main.c
+++ b/drivers/infiniband/hw/hfi1/vnic_main.c
@@ -120,7 +120,7 @@
uctxt->seq_cnt = 1;
uctxt->is_vnic = true;
- hfi1_set_vnic_msix_info(uctxt);
+ msix_request_rcd_irq(uctxt);
hfi1_stats.sps_ctxts++;
dd_dev_dbg(dd, "created vnic context %d\n", uctxt->ctxt);
@@ -135,8 +135,6 @@
dd_dev_dbg(dd, "closing vnic context %d\n", uctxt->ctxt);
flush_wc();
- hfi1_reset_vnic_msix_info(uctxt);
-
/*
* Disable receive context and interrupt available, reset all
* RcvCtxtCtrl bits to default values.
@@ -148,6 +146,10 @@
HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt);
+ /* msix_intr will always be > 0, only clean up if this is true */
+ if (uctxt->msix_intr)
+ msix_free_irq(dd, uctxt->msix_intr);
+
uctxt->event_flags = 0;
hfi1_clear_tids(uctxt);
@@ -160,12 +162,12 @@
void hfi1_vnic_setup(struct hfi1_devdata *dd)
{
- idr_init(&dd->vnic.vesw_idr);
+ xa_init(&dd->vnic.vesws);
}
void hfi1_vnic_cleanup(struct hfi1_devdata *dd)
{
- idr_destroy(&dd->vnic.vesw_idr);
+ WARN_ON(!xa_empty(&dd->vnic.vesws));
}
#define SUM_GRP_COUNTERS(stats, qstats, x_grp) do { \
@@ -421,8 +423,7 @@
static u16 hfi1_vnic_select_queue(struct net_device *netdev,
struct sk_buff *skb,
- struct net_device *sb_dev,
- select_queue_fallback_t fallback)
+ struct net_device *sb_dev)
{
struct hfi1_vnic_vport_info *vinfo = opa_vnic_dev_priv(netdev);
struct opa_vnic_skb_mdata *mdata;
@@ -532,7 +533,7 @@
l4_type = hfi1_16B_get_l4(packet->ebuf);
if (likely(l4_type == OPA_16B_L4_ETHR)) {
vesw_id = HFI1_VNIC_GET_VESWID(packet->ebuf);
- vinfo = idr_find(&dd->vnic.vesw_idr, vesw_id);
+ vinfo = xa_load(&dd->vnic.vesws, vesw_id);
/*
* In case of invalid vesw id, count the error on
@@ -540,9 +541,10 @@
*/
if (unlikely(!vinfo)) {
struct hfi1_vnic_vport_info *vinfo_tmp;
- int id_tmp = 0;
+ unsigned long index = 0;
- vinfo_tmp = idr_get_next(&dd->vnic.vesw_idr, &id_tmp);
+ vinfo_tmp = xa_find(&dd->vnic.vesws, &index, ULONG_MAX,
+ XA_PRESENT);
if (vinfo_tmp) {
spin_lock(&vport_cntr_lock);
vinfo_tmp->stats[0].netstats.rx_nohandler++;
@@ -596,8 +598,7 @@
if (!vinfo->vesw_id)
return -EINVAL;
- rc = idr_alloc(&dd->vnic.vesw_idr, vinfo, vinfo->vesw_id,
- vinfo->vesw_id + 1, GFP_NOWAIT);
+ rc = xa_insert(&dd->vnic.vesws, vinfo->vesw_id, vinfo, GFP_KERNEL);
if (rc < 0)
return rc;
@@ -623,10 +624,10 @@
clear_bit(HFI1_VNIC_UP, &vinfo->flags);
netif_carrier_off(vinfo->netdev);
netif_tx_disable(vinfo->netdev);
- idr_remove(&dd->vnic.vesw_idr, vinfo->vesw_id);
+ xa_erase(&dd->vnic.vesws, vinfo->vesw_id);
/* ensure irqs see the change */
- hfi1_vnic_synchronize_irq(dd);
+ msix_vnic_synchronize_irq(dd);
/* remove unread skbs */
for (i = 0; i < vinfo->num_rx_q; i++) {
@@ -690,8 +691,6 @@
rc = hfi1_vnic_txreq_init(dd);
if (rc)
goto txreq_fail;
-
- dd->vnic.msix_idx = dd->first_dyn_msix_idx;
}
for (i = dd->vnic.num_ctxt; i < vinfo->num_rx_q; i++) {
@@ -816,14 +815,14 @@
size = sizeof(struct opa_vnic_rdma_netdev) + sizeof(*vinfo);
netdev = alloc_netdev_mqs(size, name, name_assign_type, setup,
- chip_sdma_engines(dd), dd->num_vnic_contexts);
+ dd->num_sdma, dd->num_vnic_contexts);
if (!netdev)
return ERR_PTR(-ENOMEM);
rn = netdev_priv(netdev);
vinfo = opa_vnic_dev_priv(netdev);
vinfo->dd = dd;
- vinfo->num_tx_q = chip_sdma_engines(dd);
+ vinfo->num_tx_q = dd->num_sdma;
vinfo->num_rx_q = dd->num_vnic_contexts;
vinfo->netdev = netdev;
rn->free_rdma_netdev = hfi1_vnic_free_rn;
diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c
index c3c96c5..7d90b90 100644
--- a/drivers/infiniband/hw/hfi1/vnic_sdma.c
+++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c
@@ -1,5 +1,5 @@
/*
- * Copyright(c) 2017 Intel Corporation.
+ * Copyright(c) 2017 - 2018 Intel Corporation.
*
* This file is provided under a dual BSD/GPLv2 license. When using or
* redistributing this file, you may do so under either license.
@@ -57,7 +57,6 @@
#define HFI1_VNIC_TXREQ_NAME_LEN 32
#define HFI1_VNIC_SDMA_DESC_WTRMRK 64
-#define HFI1_VNIC_SDMA_RETRY_COUNT 1
/*
* struct vnic_txreq - VNIC transmit descriptor
@@ -67,7 +66,6 @@
* @pad: pad buffer
* @plen: pad length
* @pbc_val: pbc value
- * @retry_count: tx retry count
*/
struct vnic_txreq {
struct sdma_txreq txreq;
@@ -77,8 +75,6 @@
unsigned char pad[HFI1_VNIC_MAX_PAD];
u16 plen;
__le64 pbc_val;
-
- u32 retry_count;
};
static void vnic_sdma_complete(struct sdma_txreq *txreq,
@@ -106,13 +102,13 @@
goto bail_txadd;
for (i = 0; i < skb_shinfo(tx->skb)->nr_frags; i++) {
- struct skb_frag_struct *frag = &skb_shinfo(tx->skb)->frags[i];
+ skb_frag_t *frag = &skb_shinfo(tx->skb)->frags[i];
/* combine physically continuous fragments later? */
ret = sdma_txadd_page(sde->dd,
&tx->txreq,
skb_frag_page(frag),
- frag->page_offset,
+ skb_frag_off(frag),
skb_frag_size(frag));
if (unlikely(ret))
goto bail_txadd;
@@ -196,10 +192,9 @@
ret = build_vnic_tx_desc(sde, tx, pbc);
if (unlikely(ret))
goto free_desc;
- tx->retry_count = 0;
- ret = sdma_send_txreq(sde, &vnic_sdma->wait, &tx->txreq,
- vnic_sdma->pkts_sent);
+ ret = sdma_send_txreq(sde, iowait_get_ib_work(&vnic_sdma->wait),
+ &tx->txreq, vnic_sdma->pkts_sent);
/* When -ECOMM, sdma callback will be called with ABORT status */
if (unlikely(ret && unlikely(ret != -ECOMM)))
goto free_desc;
@@ -230,25 +225,26 @@
* become available.
*/
static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde,
- struct iowait *wait,
+ struct iowait_work *wait,
struct sdma_txreq *txreq,
uint seq,
bool pkts_sent)
{
struct hfi1_vnic_sdma *vnic_sdma =
- container_of(wait, struct hfi1_vnic_sdma, wait);
- struct hfi1_ibdev *dev = &vnic_sdma->dd->verbs_dev;
- struct vnic_txreq *tx = container_of(txreq, struct vnic_txreq, txreq);
+ container_of(wait->iow, struct hfi1_vnic_sdma, wait);
- if (sdma_progress(sde, seq, txreq))
- if (tx->retry_count++ < HFI1_VNIC_SDMA_RETRY_COUNT)
- return -EAGAIN;
+ write_seqlock(&sde->waitlock);
+ if (sdma_progress(sde, seq, txreq)) {
+ write_sequnlock(&sde->waitlock);
+ return -EAGAIN;
+ }
vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED;
- write_seqlock(&dev->iowait_lock);
- if (list_empty(&vnic_sdma->wait.list))
- iowait_queue(pkts_sent, wait, &sde->dmawait);
- write_sequnlock(&dev->iowait_lock);
+ if (list_empty(&vnic_sdma->wait.list)) {
+ iowait_get_priority(wait->iow);
+ iowait_queue(pkts_sent, wait->iow, &sde->dmawait);
+ }
+ write_sequnlock(&sde->waitlock);
return -EBUSY;
}
@@ -285,8 +281,9 @@
for (i = 0; i < vinfo->num_tx_q; i++) {
struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[i];
- iowait_init(&vnic_sdma->wait, 0, NULL, hfi1_vnic_sdma_sleep,
- hfi1_vnic_sdma_wakeup, NULL);
+ iowait_init(&vnic_sdma->wait, 0, NULL, NULL,
+ hfi1_vnic_sdma_sleep,
+ hfi1_vnic_sdma_wakeup, NULL, NULL);
vnic_sdma->sde = &vinfo->dd->per_sdma[i];
vnic_sdma->dd = vinfo->dd;
vnic_sdma->vinfo = vinfo;
@@ -295,10 +292,12 @@
/* Add a free descriptor watermark for wakeups */
if (vnic_sdma->sde->descq_cnt > HFI1_VNIC_SDMA_DESC_WTRMRK) {
+ struct iowait_work *work;
+
INIT_LIST_HEAD(&vnic_sdma->stx.list);
vnic_sdma->stx.num_desc = HFI1_VNIC_SDMA_DESC_WTRMRK;
- list_add_tail(&vnic_sdma->stx.list,
- &vnic_sdma->wait.tx_head);
+ work = iowait_get_ib_work(&vnic_sdma->wait);
+ list_add_tail(&vnic_sdma->stx.list, &work->tx_head);
}
}
}