Update Linux to v5.4.2
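
Notable hfi1 changes folded in with this update:
 - interrupt source ranges are now inclusive of their end value, so the
   NUM_* range sizes gain a "+ 1" and the is_table walk compares with
   "<=" instead of "<"
 - per-source interrupt masking through the new set_intr_bits(),
   replacing the all-or-nothing set_intr_state() and the kernel-context
   urgent special-casing in get_int_mask()
 - MSI-X setup and teardown moved out of chip.c (msix_initialize(),
   msix_request_irqs(), msix_clean_up_interrupts())
 - FECN RMT entries extended to kernel receive contexts when TID RDMA
   is enabled (init_user_fecn_handling() becomes init_fecn_handling())
 - removal of the obsolete mmiowb() calls and conversion of the device
   list to the hfi1_dev_table xarray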

Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 902d12d..9b1fb84 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -67,8 +67,6 @@
 #include "debugfs.h"
 #include "fault.h"
 
-#define NUM_IB_PORTS 1
-
 uint kdeth_qp;
 module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
 MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
@@ -1074,6 +1072,8 @@
 static void log_physical_state(struct hfi1_pportdata *ppd, u32 state);
 static int wait_physical_linkstate(struct hfi1_pportdata *ppd, u32 state,
 				   int msecs);
+static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd,
+					 int msecs);
 static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc);
 static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
 static void handle_temp_err(struct hfi1_devdata *dd);
@@ -1100,9 +1100,9 @@
 	const char *desc;
 };
 
-#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START)
-#define NUM_DC_ERRS (IS_DC_END - IS_DC_START)
-#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START)
+#define NUM_MISC_ERRS (IS_GENERAL_ERR_END + 1 - IS_GENERAL_ERR_START)
+#define NUM_DC_ERRS (IS_DC_END + 1 - IS_DC_START)
+#define NUM_VARIOUS (IS_VARIOUS_END + 1 - IS_VARIOUS_START)
 
 /*
  * Helpers for building HFI and DC error interrupt table entries.  Different
@@ -4101,9 +4101,13 @@
 def_access_ibp_counter(rdma_seq);
 def_access_ibp_counter(unaligned);
 def_access_ibp_counter(seq_naks);
+def_access_ibp_counter(rc_crwaits);
 
 static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
 [C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH),
+[C_RX_LEN_ERR] = RXE32_DEV_CNTR_ELEM(RxLenErr, RCV_LENGTH_ERR_CNT, CNTR_SYNTH),
+[C_RX_ICRC_ERR] = RXE32_DEV_CNTR_ELEM(RxICrcErr, RCV_ICRC_ERR_CNT, CNTR_SYNTH),
+[C_RX_EBP] = RXE32_DEV_CNTR_ELEM(RxEbpCnt, RCV_EBP_CNT, CNTR_SYNTH),
 [C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT,
 			CNTR_NORMAL),
 [C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT,
@@ -4253,6 +4257,8 @@
 			    access_sw_pio_drain),
 [C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL,
 			    access_sw_kmem_wait),
+[C_SW_TID_WAIT] = CNTR_ELEM("TidWait", 0, 0, CNTR_NORMAL,
+			    hfi1_access_sw_tid_wait),
 [C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL,
 			    access_sw_send_schedule),
 [C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn",
@@ -5114,6 +5120,7 @@
 [C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq),
 [C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned),
 [C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks),
+[C_SW_IBP_RC_CRWAITS] = SW_IBP_CNTR(RcCrWait, rc_crwaits),
 [C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL,
 			       access_sw_cpu_rc_acks),
 [C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL,
@@ -5222,6 +5229,17 @@
 	return (chip_rev_minor & 0xF0) == 0x10;
 }
 
+/* return true if the kernel urgent interrupt for rcd is masked */
+bool is_urg_masked(struct hfi1_ctxtdata *rcd)
+{
+	u64 mask;
+	u32 is = IS_RCVURGENT_START + rcd->ctxt;
+	u8 bit = is % 64;
+
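+	/* sources pack 64 per CCE_INT_MASK CSR; a clear bit means masked */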
+	mask = read_csr(rcd->dd, CCE_INT_MASK + (8 * (is / 64)));
+	return !(mask & BIT_ULL(bit));
+}
+
 /*
  * Append string s to buffer buf.  Arguments curp and len are the current
  * position and remaining length, respectively.
@@ -8181,7 +8199,7 @@
 /**
  * is_rcv_urgent_int() - User receive context urgent IRQ handler
  * @dd: valid dd
- * @source: logical IRQ source (ofse from IS_RCVURGENT_START)
+ * @source: logical IRQ source (offset from IS_RCVURGENT_START)
  *
  * RX block receive urgent interrupt.  Source is < 160.
  *
@@ -8231,7 +8249,7 @@
 				is_sdma_eng_err_name,	is_sdma_eng_err_int },
 { IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END,
 				is_sendctxt_err_name,	is_sendctxt_err_int },
-{ IS_SDMA_START,	     IS_SDMA_END,
+{ IS_SDMA_START,	     IS_SDMA_IDLE_END,
 				is_sdma_eng_name,	is_sdma_eng_int },
 { IS_VARIOUS_START,	     IS_VARIOUS_END,
 				is_various_name,	is_various_int },
@@ -8257,7 +8275,7 @@
 
 	/* avoids a double compare by walking the table in-order */
 	for (entry = &is_table[0]; entry->is_name; entry++) {
-		if (source < entry->end) {
+		if (source <= entry->end) {
 			trace_hfi1_interrupt(dd, entry, source);
 			entry->is_int(dd, source - entry->start);
 			return;
@@ -8276,7 +8294,7 @@
  * context DATA IRQs are threaded and are not supported by this handler.
  *
  */
-static irqreturn_t general_interrupt(int irq, void *data)
+irqreturn_t general_interrupt(int irq, void *data)
 {
 	struct hfi1_devdata *dd = data;
 	u64 regs[CCE_NUM_INT_CSRS];
@@ -8309,7 +8327,7 @@
 	return handled;
 }
 
-static irqreturn_t sdma_interrupt(int irq, void *data)
+irqreturn_t sdma_interrupt(int irq, void *data)
 {
 	struct sdma_engine *sde = data;
 	struct hfi1_devdata *dd = sde->dd;
@@ -8352,7 +8370,6 @@
 	struct hfi1_devdata *dd = rcd->dd;
 	u32 addr = CCE_INT_CLEAR + (8 * rcd->ireg);
 
-	mmiowb();	/* make sure everything before is written */
 	write_csr(dd, addr, rcd->imask);
 	/* force the above write on the chip and get a value back */
 	(void)read_csr(dd, addr);
@@ -8401,7 +8418,7 @@
  * invoked) is finished.  The intent is to avoid extra interrupts while we
  * are processing packets anyway.
  */
-static irqreturn_t receive_context_interrupt(int irq, void *data)
+irqreturn_t receive_context_interrupt(int irq, void *data)
 {
 	struct hfi1_ctxtdata *rcd = data;
 	struct hfi1_devdata *dd = rcd->dd;
@@ -8441,7 +8458,7 @@
  * Receive packet thread handler.  This expects to be invoked with the
  * receive interrupt still blocked.
  */
-static irqreturn_t receive_context_thread(int irq, void *data)
+irqreturn_t receive_context_thread(int irq, void *data)
 {
 	struct hfi1_ctxtdata *rcd = data;
 	int present;
@@ -9651,30 +9668,10 @@
 	}
 }
 
-static void init_qsfp_int(struct hfi1_devdata *dd)
+void init_qsfp_int(struct hfi1_devdata *dd)
 {
 	struct hfi1_pportdata *ppd = dd->pport;
-	u64 qsfp_mask, cce_int_mask;
-	const int qsfp1_int_smask = QSFP1_INT % 64;
-	const int qsfp2_int_smask = QSFP2_INT % 64;
-
-	/*
-	 * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0
-	 * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR,
-	 * therefore just one of QSFP1_INT/QSFP2_INT can be used to find
-	 * the index of the appropriate CSR in the CCEIntMask CSR array
-	 */
-	cce_int_mask = read_csr(dd, CCE_INT_MASK +
-				(8 * (QSFP1_INT / 64)));
-	if (dd->hfi1_id) {
-		cce_int_mask &= ~((u64)1 << qsfp1_int_smask);
-		write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)),
-			  cce_int_mask);
-	} else {
-		cce_int_mask &= ~((u64)1 << qsfp2_int_smask);
-		write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)),
-			  cce_int_mask);
-	}
+	u64 qsfp_mask;
 
 	qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N);
 	/* Clear current status to avoid spurious interrupts */
@@ -9691,6 +9688,12 @@
 	write_csr(dd,
 		  dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT,
 		  qsfp_mask);
+
+	/* Enable the appropriate QSFP IRQ source */
+	if (!dd->hfi1_id)
+		set_intr_bits(dd, QSFP1_INT, QSFP1_INT, true);
+	else
+		set_intr_bits(dd, QSFP2_INT, QSFP2_INT, true);
 }
 
 /*
@@ -9849,6 +9852,7 @@
 
 	/* disable the port */
 	clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK);
+	cancel_work_sync(&ppd->freeze_work);
 }
 
 static inline int init_cpu_counters(struct hfi1_devdata *dd)
@@ -10577,12 +10581,29 @@
 	}
 }
 
-/*
- * Verify if BCT for data VLs is non-zero.
+/**
+ * data_vls_operational() - Verify that data VL BCT credits and MTU
+ *			    are both set for each supported VL
+ * @ppd: pointer to hfi1_pportdata structure
+ *
+ * Return: true - OK, false - otherwise.
  */
 static inline bool data_vls_operational(struct hfi1_pportdata *ppd)
 {
-	return !!ppd->actual_vls_operational;
+	int i;
+	u64 reg;
+
+	if (!ppd->actual_vls_operational)
+		return false;
+
+	for (i = 0; i < ppd->vls_supported; i++) {
+		reg = read_csr(ppd->dd, SEND_CM_CREDIT_VL + (8 * i));
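+		/* credits and MTU must be set together or not at all */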
+		if ((reg && !ppd->dd->vld[i].mtu) ||
+		    (!reg && ppd->dd->vld[i].mtu))
+			return false;
+	}
+
+	return true;
 }
 
 /*
@@ -10695,7 +10716,8 @@
 
 		if (!data_vls_operational(ppd)) {
 			dd_dev_err(dd,
-				   "%s: data VLs not operational\n", __func__);
+				   "%s: Invalid data VL credits or mtu\n",
+				   __func__);
 			ret = -EINVAL;
 			break;
 		}
@@ -10768,13 +10790,15 @@
 			break;
 
 		ppd->port_error_action = 0;
-		ppd->host_link_state = HLS_DN_POLL;
 
 		if (quick_linkup) {
 			/* quick linkup does not go into polling */
 			ret = do_quick_linkup(dd);
 		} else {
 			ret1 = set_physical_link_state(dd, PLS_POLLING);
+			if (!ret1)
+				ret1 = wait_phys_link_out_of_offline(ppd,
+								     3000);
 			if (ret1 != HCMD_SUCCESS) {
 				dd_dev_err(dd,
 					   "Failed to transition to Polling link state, return 0x%x\n",
@@ -10782,6 +10806,14 @@
 				ret = -EINVAL;
 			}
 		}
+
+		/*
+		 * Change the host link state after requesting DC8051 to
+		 * change its physical state so that we can ignore any
+		 * interrupt with stale LNI(XX) error, which will not be
+		 * cleared until DC8051 transitions to Polling state.
+		 */
+		ppd->host_link_state = HLS_DN_POLL;
 		ppd->offline_disabled_reason =
 			HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE);
 		/*
@@ -11776,12 +11808,10 @@
 			<< RCV_EGR_INDEX_HEAD_HEAD_SHIFT;
 		write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg);
 	}
-	mmiowb();
 	reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) |
 		(((u64)hd & RCV_HDR_HEAD_HEAD_MASK)
 			<< RCV_HDR_HEAD_HEAD_SHIFT);
 	write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg);
-	mmiowb();
 }
 
 u32 hdrqempty(struct hfi1_ctxtdata *rcd)
@@ -11932,10 +11962,16 @@
 
 		rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK;
 	}
-	if (op & HFI1_RCVCTRL_INTRAVAIL_ENB)
+	if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) {
+		set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt,
+			      IS_RCVAVAIL_START + rcd->ctxt, true);
 		rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
-	if (op & HFI1_RCVCTRL_INTRAVAIL_DIS)
+	}
+	if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) {
+		set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt,
+			      IS_RCVAVAIL_START + rcd->ctxt, false);
 		rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
+	}
 	if ((op & HFI1_RCVCTRL_TAILUPD_ENB) && rcd->rcvhdrtail_kvaddr)
 		rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
 	if (op & HFI1_RCVCTRL_TAILUPD_DIS) {
@@ -11965,6 +12001,13 @@
 		rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
 	if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS)
 		rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK;
+	if (op & HFI1_RCVCTRL_URGENT_ENB)
+		set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt,
+			      IS_RCVURGENT_START + rcd->ctxt, true);
+	if (op & HFI1_RCVCTRL_URGENT_DIS)
+		set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt,
+			      IS_RCVURGENT_START + rcd->ctxt, false);
+
 	hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl);
 	write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcvctrl);
 
@@ -12913,6 +12956,39 @@
 	return read_state;
 }
 
+/*
+ * wait_phys_link_out_of_offline - wait for any state other than offline
+ * @ppd: port device
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for the physical link state to leave
+ * the offline state.
+ * Returns the read physical state on success, otherwise -ETIMEDOUT.
+ */
+static int wait_phys_link_out_of_offline(struct hfi1_pportdata *ppd,
+					 int msecs)
+{
+	u32 read_state;
+	unsigned long timeout;
+
+	timeout = jiffies + msecs_to_jiffies(msecs);
+	while (1) {
+		read_state = read_physical_state(ppd->dd);
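+		/* only the major state matters; any offline substate keeps waiting */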
+		if ((read_state & 0xF0) != PLS_OFFLINE)
+			break;
+		if (time_after(jiffies, timeout)) {
+			dd_dev_err(ppd->dd,
+				   "timeout waiting for phy link out of offline. Read state 0x%x, %dms\n",
+				   read_state, msecs);
+			return -ETIMEDOUT;
+		}
+		usleep_range(1950, 2050); /* sleep 2ms-ish */
+	}
+
+	log_state_transition(ppd, read_state);
+	return read_state;
+}
+
 #define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \
 (r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK)
 
@@ -12964,63 +13040,71 @@
 	return ret;
 }
 
-/**
- * get_int_mask - get 64 bit int mask
- * @dd - the devdata
- * @i - the csr (relative to CCE_INT_MASK)
- *
- * Returns the mask with the urgent interrupt mask
- * bit clear for kernel receive contexts.
- */
-static u64 get_int_mask(struct hfi1_devdata *dd, u32 i)
-{
-	u64 mask = U64_MAX; /* default to no change */
-
-	if (i >= (IS_RCVURGENT_START / 64) && i < (IS_RCVURGENT_END / 64)) {
-		int j = (i - (IS_RCVURGENT_START / 64)) * 64;
-		int k = !j ? IS_RCVURGENT_START % 64 : 0;
-
-		if (j)
-			j -= IS_RCVURGENT_START % 64;
-		/* j = 0..dd->first_dyn_alloc_ctxt - 1,k = 0..63 */
-		for (; j < dd->first_dyn_alloc_ctxt && k < 64; j++, k++)
-			/* convert to bit in mask and clear */
-			mask &= ~BIT_ULL(k);
-	}
-	return mask;
-}
-
 /* ========================================================================= */
 
-/*
- * Enable/disable chip from delivering interrupts.
+/**
+ * read_mod_write() - Calculate the IRQ register index and set/clear the bits
+ * @dd: valid devdata
+ * @src: IRQ source to determine register index from
+ * @bits: the bits to set or clear
+ * @set: true == set the bits, false == clear the bits
+ *
  */
-void set_intr_state(struct hfi1_devdata *dd, u32 enable)
+static void read_mod_write(struct hfi1_devdata *dd, u16 src, u64 bits,
+			   bool set)
 {
-	int i;
+	u64 reg;
+	u16 idx = src / BITS_PER_REGISTER;
 
-	/*
-	 * In HFI, the mask needs to be 1 to allow interrupts.
-	 */
-	if (enable) {
-		/* enable all interrupts but urgent on kernel contexts */
-		for (i = 0; i < CCE_NUM_INT_CSRS; i++) {
-			u64 mask = get_int_mask(dd, i);
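+	/* the mask CSR is shared by 64 sources; serialize the read-modify-write */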
+	spin_lock(&dd->irq_src_lock);
+	reg = read_csr(dd, CCE_INT_MASK + (8 * idx));
+	if (set)
+		reg |= bits;
+	else
+		reg &= ~bits;
+	write_csr(dd, CCE_INT_MASK + (8 * idx), reg);
+	spin_unlock(&dd->irq_src_lock);
+}
 
-			write_csr(dd, CCE_INT_MASK + (8 * i), mask);
+/**
+ * set_intr_bits() - Enable/disable a range (one or more) IRQ sources
+ * @dd: valid devdata
+ * @first: first IRQ source to set/clear
+ * @last: last IRQ source (inclusive) to set/clear
+ * @set: true == set the bits, false == clear the bits
+ *
+ * If first == last, set the exact source.
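+ *
+ * For example, hfi1_rcvctrl() enables a single receive urgent source
+ * with set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt,
+ * IS_RCVURGENT_START + rcd->ctxt, true).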
+ */
+int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set)
+{
+	u64 bits = 0;
+	u64 bit;
+	u16 src;
+
+	if (first > NUM_INTERRUPT_SOURCES || last > NUM_INTERRUPT_SOURCES)
+		return -EINVAL;
+
+	if (last < first)
+		return -ERANGE;
+
+	for (src = first; src <= last; src++) {
+		bit = src % BITS_PER_REGISTER;
+		/* wrapped to next register? */
+		if (!bit && bits) {
+			read_mod_write(dd, src - 1, bits, set);
+			bits = 0;
 		}
-
-		init_qsfp_int(dd);
-	} else {
-		for (i = 0; i < CCE_NUM_INT_CSRS; i++)
-			write_csr(dd, CCE_INT_MASK + (8 * i), 0ull);
+		bits |= BIT_ULL(bit);
 	}
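+	/* write out bits accumulated for the final register */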
+	read_mod_write(dd, last, bits, set);
+
+	return 0;
 }
 
 /*
  * Clear all interrupt sources on the chip.
  */
-static void clear_all_interrupts(struct hfi1_devdata *dd)
+void clear_all_interrupts(struct hfi1_devdata *dd)
 {
 	int i;
 
@@ -13044,38 +13128,11 @@
 	write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0);
 }
 
-/**
- * hfi1_clean_up_interrupts() - Free all IRQ resources
- * @dd: valid device data data structure
- *
- * Free the MSIx and assoicated PCI resources, if they have been allocated.
- */
-void hfi1_clean_up_interrupts(struct hfi1_devdata *dd)
-{
-	int i;
-	struct hfi1_msix_entry *me = dd->msix_entries;
-
-	/* remove irqs - must happen before disabling/turning off */
-	for (i = 0; i < dd->num_msix_entries; i++, me++) {
-		if (!me->arg) /* => no irq, no affinity */
-			continue;
-		hfi1_put_irq_affinity(dd, me);
-		pci_free_irq(dd->pcidev, i, me->arg);
-	}
-
-	/* clean structures */
-	kfree(dd->msix_entries);
-	dd->msix_entries = NULL;
-	dd->num_msix_entries = 0;
-
-	pci_free_irq_vectors(dd->pcidev);
-}
-
 /*
  * Remap the interrupt source from the general handler to the given MSI-X
  * interrupt.
  */
-static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
+void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr)
 {
 	u64 reg;
 	int m, n;
@@ -13099,8 +13156,7 @@
 	write_csr(dd, CCE_INT_MAP + (8 * m), reg);
 }
 
-static void remap_sdma_interrupts(struct hfi1_devdata *dd,
-				  int engine, int msix_intr)
+void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr)
 {
 	/*
 	 * SDMA engine interrupt sources grouped by type, rather than
@@ -13109,204 +13165,16 @@
 	 *	SDMAProgress
 	 *	SDMAIdle
 	 */
-	remap_intr(dd, IS_SDMA_START + 0 * TXE_NUM_SDMA_ENGINES + engine,
-		   msix_intr);
-	remap_intr(dd, IS_SDMA_START + 1 * TXE_NUM_SDMA_ENGINES + engine,
-		   msix_intr);
-	remap_intr(dd, IS_SDMA_START + 2 * TXE_NUM_SDMA_ENGINES + engine,
-		   msix_intr);
-}
-
-static int request_msix_irqs(struct hfi1_devdata *dd)
-{
-	int first_general, last_general;
-	int first_sdma, last_sdma;
-	int first_rx, last_rx;
-	int i, ret = 0;
-
-	/* calculate the ranges we are going to use */
-	first_general = 0;
-	last_general = first_general + 1;
-	first_sdma = last_general;
-	last_sdma = first_sdma + dd->num_sdma;
-	first_rx = last_sdma;
-	last_rx = first_rx + dd->n_krcv_queues + dd->num_vnic_contexts;
-
-	/* VNIC MSIx interrupts get mapped when VNIC contexts are created */
-	dd->first_dyn_msix_idx = first_rx + dd->n_krcv_queues;
-
-	/*
-	 * Sanity check - the code expects all SDMA chip source
-	 * interrupts to be in the same CSR, starting at bit 0.  Verify
-	 * that this is true by checking the bit location of the start.
-	 */
-	BUILD_BUG_ON(IS_SDMA_START % 64);
-
-	for (i = 0; i < dd->num_msix_entries; i++) {
-		struct hfi1_msix_entry *me = &dd->msix_entries[i];
-		const char *err_info;
-		irq_handler_t handler;
-		irq_handler_t thread = NULL;
-		void *arg = NULL;
-		int idx;
-		struct hfi1_ctxtdata *rcd = NULL;
-		struct sdma_engine *sde = NULL;
-		char name[MAX_NAME_SIZE];
-
-		/* obtain the arguments to pci_request_irq */
-		if (first_general <= i && i < last_general) {
-			idx = i - first_general;
-			handler = general_interrupt;
-			arg = dd;
-			snprintf(name, sizeof(name),
-				 DRIVER_NAME "_%d", dd->unit);
-			err_info = "general";
-			me->type = IRQ_GENERAL;
-		} else if (first_sdma <= i && i < last_sdma) {
-			idx = i - first_sdma;
-			sde = &dd->per_sdma[idx];
-			handler = sdma_interrupt;
-			arg = sde;
-			snprintf(name, sizeof(name),
-				 DRIVER_NAME "_%d sdma%d", dd->unit, idx);
-			err_info = "sdma";
-			remap_sdma_interrupts(dd, idx, i);
-			me->type = IRQ_SDMA;
-		} else if (first_rx <= i && i < last_rx) {
-			idx = i - first_rx;
-			rcd = hfi1_rcd_get_by_index_safe(dd, idx);
-			if (rcd) {
-				/*
-				 * Set the interrupt register and mask for this
-				 * context's interrupt.
-				 */
-				rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
-				rcd->imask = ((u64)1) <<
-					  ((IS_RCVAVAIL_START + idx) % 64);
-				handler = receive_context_interrupt;
-				thread = receive_context_thread;
-				arg = rcd;
-				snprintf(name, sizeof(name),
-					 DRIVER_NAME "_%d kctxt%d",
-					 dd->unit, idx);
-				err_info = "receive context";
-				remap_intr(dd, IS_RCVAVAIL_START + idx, i);
-				me->type = IRQ_RCVCTXT;
-				rcd->msix_intr = i;
-				hfi1_rcd_put(rcd);
-			}
-		} else {
-			/* not in our expected range - complain, then
-			 * ignore it
-			 */
-			dd_dev_err(dd,
-				   "Unexpected extra MSI-X interrupt %d\n", i);
-			continue;
-		}
-		/* no argument, no interrupt */
-		if (!arg)
-			continue;
-		/* make sure the name is terminated */
-		name[sizeof(name) - 1] = 0;
-		me->irq = pci_irq_vector(dd->pcidev, i);
-		ret = pci_request_irq(dd->pcidev, i, handler, thread, arg,
-				      name);
-		if (ret) {
-			dd_dev_err(dd,
-				   "unable to allocate %s interrupt, irq %d, index %d, err %d\n",
-				   err_info, me->irq, idx, ret);
-			return ret;
-		}
-		/*
-		 * assign arg after pci_request_irq call, so it will be
-		 * cleaned up
-		 */
-		me->arg = arg;
-
-		ret = hfi1_get_irq_affinity(dd, me);
-		if (ret)
-			dd_dev_err(dd, "unable to pin IRQ %d\n", ret);
-	}
-
-	return ret;
-}
-
-void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd)
-{
-	int i;
-
-	for (i = 0; i < dd->vnic.num_ctxt; i++) {
-		struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i];
-		struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr];
-
-		synchronize_irq(me->irq);
-	}
-}
-
-void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd)
-{
-	struct hfi1_devdata *dd = rcd->dd;
-	struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr];
-
-	if (!me->arg) /* => no irq, no affinity */
-		return;
-
-	hfi1_put_irq_affinity(dd, me);
-	pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg);
-
-	me->arg = NULL;
-}
-
-void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd)
-{
-	struct hfi1_devdata *dd = rcd->dd;
-	struct hfi1_msix_entry *me;
-	int idx = rcd->ctxt;
-	void *arg = rcd;
-	int ret;
-
-	rcd->msix_intr = dd->vnic.msix_idx++;
-	me = &dd->msix_entries[rcd->msix_intr];
-
-	/*
-	 * Set the interrupt register and mask for this
-	 * context's interrupt.
-	 */
-	rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
-	rcd->imask = ((u64)1) <<
-		  ((IS_RCVAVAIL_START + idx) % 64);
-	me->type = IRQ_RCVCTXT;
-	me->irq = pci_irq_vector(dd->pcidev, rcd->msix_intr);
-	remap_intr(dd, IS_RCVAVAIL_START + idx, rcd->msix_intr);
-
-	ret = pci_request_irq(dd->pcidev, rcd->msix_intr,
-			      receive_context_interrupt,
-			      receive_context_thread, arg,
-			      DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
-	if (ret) {
-		dd_dev_err(dd, "vnic irq request (irq %d, idx %d) fail %d\n",
-			   me->irq, idx, ret);
-		return;
-	}
-	/*
-	 * assign arg after pci_request_irq call, so it will be
-	 * cleaned up
-	 */
-	me->arg = arg;
-
-	ret = hfi1_get_irq_affinity(dd, me);
-	if (ret) {
-		dd_dev_err(dd,
-			   "unable to pin IRQ %d\n", ret);
-		pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg);
-	}
+	remap_intr(dd, IS_SDMA_START + engine, msix_intr);
+	remap_intr(dd, IS_SDMA_PROGRESS_START + engine, msix_intr);
+	remap_intr(dd, IS_SDMA_IDLE_START + engine, msix_intr);
 }
 
 /*
  * Set the general handler to accept all interrupts, remap all
  * chip interrupts back to MSI-X 0.
  */
-static void reset_interrupts(struct hfi1_devdata *dd)
+void reset_interrupts(struct hfi1_devdata *dd)
 {
 	int i;
 
@@ -13319,54 +13187,33 @@
 		write_csr(dd, CCE_INT_MAP + (8 * i), 0);
 }
 
+/**
+ * set_up_interrupts() - Initialize the IRQ resources and state
+ * @dd: valid devdata
+ *
+ */
 static int set_up_interrupts(struct hfi1_devdata *dd)
 {
-	u32 total;
-	int ret, request;
-
-	/*
-	 * Interrupt count:
-	 *	1 general, "slow path" interrupt (includes the SDMA engines
-	 *		slow source, SDMACleanupDone)
-	 *	N interrupts - one per used SDMA engine
-	 *	M interrupt - one per kernel receive context
-	 *	V interrupt - one for each VNIC context
-	 */
-	total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts;
-
-	/* ask for MSI-X interrupts */
-	request = request_msix(dd, total);
-	if (request < 0) {
-		ret = request;
-		goto fail;
-	} else {
-		dd->msix_entries = kcalloc(total, sizeof(*dd->msix_entries),
-					   GFP_KERNEL);
-		if (!dd->msix_entries) {
-			ret = -ENOMEM;
-			goto fail;
-		}
-		/* using MSI-X */
-		dd->num_msix_entries = total;
-		dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
-	}
+	int ret;
 
 	/* mask all interrupts */
-	set_intr_state(dd, 0);
+	set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false);
+
 	/* clear all pending interrupts */
 	clear_all_interrupts(dd);
 
 	/* reset general handler mask, chip MSI-X mappings */
 	reset_interrupts(dd);
 
-	ret = request_msix_irqs(dd);
+	/* ask for MSI-X interrupts */
+	ret = msix_initialize(dd);
 	if (ret)
-		goto fail;
+		return ret;
 
-	return 0;
+	ret = msix_request_irqs(dd);
+	if (ret)
+		msix_clean_up_interrupts(dd);
 
-fail:
-	hfi1_clean_up_interrupts(dd);
 	return ret;
 }
 
@@ -13388,7 +13235,7 @@
 	int total_contexts;
 	int ret;
 	unsigned ngroups;
-	int qos_rmt_count;
+	int rmt_count;
 	int user_rmt_reduced;
 	u32 n_usr_ctxts;
 	u32 send_contexts = chip_send_contexts(dd);
@@ -13450,10 +13297,23 @@
 		n_usr_ctxts = rcv_contexts - total_contexts;
 	}
 
-	/* each user context requires an entry in the RMT */
-	qos_rmt_count = qos_rmt_entries(dd, NULL, NULL);
-	if (qos_rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
-		user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count;
+	/*
+	 * The RMT entries are currently allocated as shown below:
+	 * 1. QOS (0 to 128 entries);
+	 * 2. FECN (num_kernel_context - 1 + num_user_contexts +
+	 *    num_vnic_contexts);
+	 * 3. VNIC (num_vnic_contexts).
+	 * It should be noted that FECN oversubscribes num_vnic_contexts
+	 * entries of RMT because both VNIC and PSM could allocate any receive
+	 * context between dd->first_dyn_alloc_ctxt and dd->num_rcv_contexts,
+	 * and PSM FECN must reserve an RMT entry for each possible PSM receive
+	 * context.
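+	 * The rmt_count computed below covers the QOS and doubled VNIC
+	 * entries (plus num_kernel_contexts - 1 when TID RDMA is enabled);
+	 * user context entries are accounted for via n_usr_ctxts in the
+	 * overflow check that follows.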
+	 */
+	rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_vnic_contexts * 2);
+	if (HFI1_CAP_IS_KSET(TID_RDMA))
+		rmt_count += num_kernel_contexts - 1;
+	if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
+		user_rmt_reduced = NUM_MAP_ENTRIES - rmt_count;
 		dd_dev_err(dd,
 			   "RMT size is reducing the number of user receive contexts from %u to %d\n",
 			   n_usr_ctxts,
@@ -14174,6 +14034,19 @@
 }
 
 /**
+ * hfi1_get_qp_map() - Read one entry of the RCV_QP_MAP_TABLE CSR
+ * @dd: device data
+ * @idx: index to read
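+ *
+ * Return: the RCV_QP_MAP_TABLE entry (one byte) for @idx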
+ */
+u8 hfi1_get_qp_map(struct hfi1_devdata *dd, u8 idx)
+{
+	u64 reg = read_csr(dd, RCV_QP_MAP_TABLE + (idx / 8) * 8);
+
+	reg >>= (idx % 8) * 8;
+	return reg;
+}
+
+/**
  * init_qpmap_table
  * @dd - device data
  * @first_ctxt - first context
@@ -14434,35 +14307,43 @@
 	init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
 }
 
-static void init_user_fecn_handling(struct hfi1_devdata *dd,
-				    struct rsm_map_table *rmt)
+static void init_fecn_handling(struct hfi1_devdata *dd,
+			       struct rsm_map_table *rmt)
 {
 	struct rsm_rule_data rrd;
 	u64 reg;
-	int i, idx, regoff, regidx;
+	int i, idx, regoff, regidx, start;
 	u8 offset;
+	u32 total_cnt;
+
+	if (HFI1_CAP_IS_KSET(TID_RDMA))
+		/* Exclude context 0 */
+		start = 1;
+	else
+		start = dd->first_dyn_alloc_ctxt;
+
+	total_cnt = dd->num_rcv_contexts - start;
 
 	/* there needs to be enough room in the map table */
-	if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) {
-		dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+	if (rmt->used + total_cnt >= NUM_MAP_ENTRIES) {
+		dd_dev_err(dd, "FECN handling disabled - too many contexts allocated\n");
 		return;
 	}
 
 	/*
 	 * RSM will extract the destination context as an index into the
 	 * map table.  The destination contexts are a sequential block
-	 * in the range first_dyn_alloc_ctxt...num_rcv_contexts-1 (inclusive).
+	 * in the range start...num_rcv_contexts-1 (inclusive).
 	 * Map entries are accessed as offset + extracted value.  Adjust
 	 * the added offset so this sequence can be placed anywhere in
 	 * the table - as long as the entries themselves do not wrap.
 	 * There are only enough bits in offset for the table size, so
 	 * start with that to allow for a "negative" offset.
 	 */
-	offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
-						(int)dd->first_dyn_alloc_ctxt);
+	offset = (u8)(NUM_MAP_ENTRIES + rmt->used - start);
 
-	for (i = dd->first_dyn_alloc_ctxt, idx = rmt->used;
-				i < dd->num_rcv_contexts; i++, idx++) {
+	for (i = start, idx = rmt->used; i < dd->num_rcv_contexts;
+	     i++, idx++) {
 		/* replace with identity mapping */
 		regoff = (idx % 8) * 8;
 		regidx = idx / 8;
@@ -14497,7 +14378,7 @@
 	/* add rule 1 */
 	add_rsm_rule(dd, RSM_INS_FECN, &rrd);
 
-	rmt->used += dd->num_user_contexts;
+	rmt->used += total_cnt;
 }
 
 /* Initialize RSM for VNIC */
@@ -14573,7 +14454,7 @@
 		clear_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
 }
 
-static void init_rxe(struct hfi1_devdata *dd)
+static int init_rxe(struct hfi1_devdata *dd)
 {
 	struct rsm_map_table *rmt;
 	u64 val;
@@ -14582,9 +14463,12 @@
 	write_csr(dd, RCV_ERR_MASK, ~0ull);
 
 	rmt = alloc_rsm_map_table(dd);
+	if (!rmt)
+		return -ENOMEM;
+
 	/* set up QOS, including the QPN map table */
 	init_qos(dd, rmt);
-	init_user_fecn_handling(dd, rmt);
+	init_fecn_handling(dd, rmt);
 	complete_rsm_map_table(dd, rmt);
 	/* record number of used rsm map entries for vnic */
 	dd->vnic.rmt_start = rmt->used;
@@ -14608,6 +14492,7 @@
 	val |= ((4ull & RCV_BYPASS_HDR_SIZE_MASK) <<
 		RCV_BYPASS_HDR_SIZE_SHIFT);
 	write_csr(dd, RCV_BYPASS, val);
+	return 0;
 }
 
 static void init_other(struct hfi1_devdata *dd)
@@ -14810,8 +14695,8 @@
  */
 static int init_asic_data(struct hfi1_devdata *dd)
 {
-	unsigned long flags;
-	struct hfi1_devdata *tmp, *peer = NULL;
+	unsigned long index;
+	struct hfi1_devdata *peer;
 	struct hfi1_asic_data *asic_data;
 	int ret = 0;
 
@@ -14820,14 +14705,12 @@
 	if (!asic_data)
 		return -ENOMEM;
 
-	spin_lock_irqsave(&hfi1_devs_lock, flags);
+	xa_lock_irq(&hfi1_dev_table);
 	/* Find our peer device */
-	list_for_each_entry(tmp, &hfi1_dev_list, list) {
-		if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) &&
-		    dd->unit != tmp->unit) {
-			peer = tmp;
+	xa_for_each(&hfi1_dev_table, index, peer) {
+		if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(peer)) &&
+		    dd->unit != peer->unit)
 			break;
-		}
 	}
 
 	if (peer) {
@@ -14839,7 +14722,7 @@
 		mutex_init(&dd->asic_data->asic_resource_mutex);
 	}
 	dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */
-	spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+	xa_unlock_irq(&hfi1_dev_table);
 
 	/* first one through - set up i2c devices */
 	if (!peer)
@@ -14919,20 +14802,16 @@
 }
 
 /**
- * Allocate and initialize the device structure for the hfi.
+ * hfi1_init_dd() - Initialize most of the dd structure.
  * @dev: the pci_dev for hfi1_ib device
  * @ent: pci_device_id struct for this dev
  *
- * Also allocates, initializes, and returns the devdata struct for this
- * device instance
- *
  * This is global, and is called directly at init to set up the
  * chip-specific function pointers for later use.
  */
-struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
-				  const struct pci_device_id *ent)
+int hfi1_init_dd(struct hfi1_devdata *dd)
 {
-	struct hfi1_devdata *dd;
+	struct pci_dev *pdev = dd->pcidev;
 	struct hfi1_pportdata *ppd;
 	u64 reg;
 	int i, ret;
@@ -14943,13 +14822,8 @@
 		"Functional simulator"
 	};
 	struct pci_dev *parent = pdev->bus->self;
-	u32 sdma_engines;
+	u32 sdma_engines = chip_sdma_engines(dd);
 
-	dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS *
-				sizeof(struct hfi1_pportdata));
-	if (IS_ERR(dd))
-		goto bail;
-	sdma_engines = chip_sdma_engines(dd);
 	ppd = dd->pport;
 	for (i = 0; i < dd->num_pports; i++, ppd++) {
 		int vl;
@@ -15128,6 +15002,12 @@
 	if (ret)
 		goto bail_cleanup;
 
+	/*
+	 * This should probably occur in hfi1_pcie_init(), but historically
+	 * occurs after the do_pcie_gen3_transition() code.
+	 */
+	tune_pcie_caps(dd);
+
 	/* start setting dd values and adjusting CSRs */
 	init_early_variables(dd);
 
@@ -15150,7 +15030,10 @@
 		goto bail_cleanup;
 
 	/* set initial RXE CSRs */
-	init_rxe(dd);
+	ret = init_rxe(dd);
+	if (ret)
+		goto bail_cleanup;
+
 	/* set initial TXE CSRs */
 	init_txe(dd);
 	/* set initial non-RXE, non-TXE CSRs */
@@ -15240,14 +15123,13 @@
 	free_cntrs(dd);
 bail_clear_intr:
 	hfi1_comp_vectors_clean_up(dd);
-	hfi1_clean_up_interrupts(dd);
+	msix_clean_up_interrupts(dd);
 bail_cleanup:
 	hfi1_pcie_ddcleanup(dd);
 bail_free:
 	hfi1_free_devdata(dd);
-	dd = ERR_PTR(ret);
 bail:
-	return dd;
+	return ret;
 }
 
 static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate,