Update Linux to v5.10.109

Sourced from [1].

[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz

Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
index 433fca5..0aeccd9 100644
--- a/drivers/infiniband/hw/Makefile
+++ b/drivers/infiniband/hw/Makefile
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_INFINIBAND_MTHCA)		+= mthca/
 obj-$(CONFIG_INFINIBAND_QIB)		+= qib/
-obj-$(CONFIG_INFINIBAND_CXGB3)		+= cxgb3/
 obj-$(CONFIG_INFINIBAND_CXGB4)		+= cxgb4/
 obj-$(CONFIG_INFINIBAND_EFA)		+= efa/
 obj-$(CONFIG_INFINIBAND_I40IW)		+= i40iw/
diff --git a/drivers/infiniband/hw/bnxt_re/Kconfig b/drivers/infiniband/hw/bnxt_re/Kconfig
index ab8779d..0feac51 100644
--- a/drivers/infiniband/hw/bnxt_re/Kconfig
+++ b/drivers/infiniband/hw/bnxt_re/Kconfig
@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_BNXT_RE
-        tristate "Broadcom Netxtreme HCA support"
-        depends on 64BIT
-        depends on ETHERNET && NETDEVICES && PCI && INET && DCB
-        select NET_VENDOR_BROADCOM
-        select BNXT
-        ---help---
+	tristate "Broadcom Netxtreme HCA support"
+	depends on 64BIT
+	depends on ETHERNET && NETDEVICES && PCI && INET && DCB
+	select NET_VENDOR_BROADCOM
+	select BNXT
+	help
 	  This driver supports Broadcom NetXtreme-E 10/25/40/50 gigabit
 	  RoCE HCAs.  To compile this driver as a module, choose M here:
 	  the module will be called bnxt_re.
diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index e55a166..b930ea3 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -89,6 +89,15 @@
 
 #define BNXT_RE_DEFAULT_ACK_DELAY	16
 
+struct bnxt_re_ring_attr {
+	dma_addr_t	*dma_arr;
+	int		pages;
+	int		type;
+	u32		depth;
+	u32		lrid; /* Logical ring id */
+	u8		mode;
+};
+
 struct bnxt_re_work {
 	struct work_struct	work;
 	unsigned long		event;
@@ -104,17 +113,25 @@
 	struct bnxt_re_qp *qp1_qp;
 };
 
+#define BNXT_RE_MAX_GSI_SQP_ENTRIES	1024
+struct bnxt_re_gsi_context {
+	struct	bnxt_re_qp *gsi_qp;
+	struct	bnxt_re_qp *gsi_sqp;
+	struct	bnxt_re_ah *gsi_sah;
+	struct	bnxt_re_sqp_entries *sqp_tbl;
+};
+
 #define BNXT_RE_MIN_MSIX		2
 #define BNXT_RE_MAX_MSIX		9
 #define BNXT_RE_AEQ_IDX			0
 #define BNXT_RE_NQ_IDX			1
+#define BNXT_RE_GEN_P5_MAX_VF		64
 
 struct bnxt_re_dev {
 	struct ib_device		ibdev;
 	struct list_head		list;
 	unsigned long			flags;
 #define BNXT_RE_FLAG_NETDEV_REGISTERED		0
-#define BNXT_RE_FLAG_IBDEV_REGISTERED		1
 #define BNXT_RE_FLAG_GOT_MSIX			2
 #define BNXT_RE_FLAG_HAVE_L2_REF		3
 #define BNXT_RE_FLAG_RCFW_CHANNEL_EN		4
@@ -124,7 +141,7 @@
 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS          29
 	struct net_device		*netdev;
 	unsigned int			version, major, minor;
-	struct bnxt_qplib_chip_ctx	chip_ctx;
+	struct bnxt_qplib_chip_ctx	*chip_ctx;
 	struct bnxt_en_dev		*en_dev;
 	struct bnxt_msix_entry		msix_entries[BNXT_RE_MAX_MSIX];
 	int				num_msix;
@@ -133,7 +150,7 @@
 
 	struct delayed_work		worker;
 	u8				cur_prio_map;
-	u8				active_speed;
+	u16				active_speed;
 	u8				active_width;
 
 	/* FP Notification Queue (CQ & SRQ) */
@@ -159,15 +176,11 @@
 	atomic_t			srq_count;
 	atomic_t			mr_count;
 	atomic_t			mw_count;
-	atomic_t			sched_count;
 	/* Max of 2 lossless traffic class supported per port */
 	u16				cosq[2];
 
 	/* QP for for handling QP1 packets */
-	u32				sqp_id;
-	struct bnxt_re_qp		*qp1_sqp;
-	struct bnxt_re_ah		*sqp_ah;
-	struct bnxt_re_sqp_entries sqp_tbl[1024];
+	struct bnxt_re_gsi_context	gsi_ctx;
 	atomic_t nq_alloc_cnt;
 	u32 is_virtfn;
 	u32 num_vfs;
diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c
index 3421a0b..5f5408c 100644
--- a/drivers/infiniband/hw/bnxt_re/hw_counters.c
+++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c
@@ -132,7 +132,7 @@
 		stats->value[BNXT_RE_RECOVERABLE_ERRORS] =
 			le64_to_cpu(bnxt_re_stats->tx_bcast_pkts);
 		stats->value[BNXT_RE_RX_DROPS] =
-			le64_to_cpu(bnxt_re_stats->rx_drop_pkts);
+			le64_to_cpu(bnxt_re_stats->rx_error_pkts);
 		stats->value[BNXT_RE_RX_DISCARDS] =
 			le64_to_cpu(bnxt_re_stats->rx_discard_pkts);
 		stats->value[BNXT_RE_RX_PKTS] =
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index a96f914..10d77f5 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -177,9 +177,6 @@
 	ib_attr->max_total_mcast_qp_attach = 0;
 	ib_attr->max_ah = dev_attr->max_ah;
 
-	ib_attr->max_fmr = 0;
-	ib_attr->max_map_per_fmr = 0;
-
 	ib_attr->max_srq = dev_attr->max_srq;
 	ib_attr->max_srq_wr = dev_attr->max_srq_wqes;
 	ib_attr->max_srq_sge = dev_attr->max_srq_sges;
@@ -191,24 +188,6 @@
 	return 0;
 }
 
-int bnxt_re_modify_device(struct ib_device *ibdev,
-			  int device_modify_mask,
-			  struct ib_device_modify *device_modify)
-{
-	switch (device_modify_mask) {
-	case IB_DEVICE_MODIFY_SYS_IMAGE_GUID:
-		/* Modify the GUID requires the modification of the GID table */
-		/* GUID should be made as READ-ONLY */
-		break;
-	case IB_DEVICE_MODIFY_NODE_DESC:
-		/* Node Desc should be made as READ-ONLY */
-		break;
-	default:
-		break;
-	}
-	return 0;
-}
-
 /* Port */
 int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num,
 		       struct ib_port_attr *port_attr)
@@ -330,9 +309,9 @@
 		 */
 		if (ctx->idx == 0 &&
 		    rdma_link_local_addr((struct in6_addr *)gid_to_del) &&
-		    ctx->refcnt == 1 && rdev->qp1_sqp) {
-			dev_dbg(rdev_to_dev(rdev),
-				"Trying to delete GID0 while QP1 is alive\n");
+		    ctx->refcnt == 1 && rdev->gsi_ctx.gsi_sqp) {
+			ibdev_dbg(&rdev->ibdev,
+				  "Trying to delete GID0 while QP1 is alive\n");
 			return -EFAULT;
 		}
 		ctx->refcnt--;
@@ -340,8 +319,8 @@
 			rc = bnxt_qplib_del_sgid(sgid_tbl, gid_to_del,
 						 vlan_id,  true);
 			if (rc) {
-				dev_err(rdev_to_dev(rdev),
-					"Failed to remove GID: %#x", rc);
+				ibdev_err(&rdev->ibdev,
+					  "Failed to remove GID: %#x", rc);
 			} else {
 				ctx_tbl = sgid_tbl->ctx;
 				ctx_tbl[ctx->idx] = NULL;
@@ -378,7 +357,7 @@
 	}
 
 	if (rc < 0) {
-		dev_err(rdev_to_dev(rdev), "Failed to add GID: %#x", rc);
+		ibdev_err(&rdev->ibdev, "Failed to add GID: %#x", rc);
 		return rc;
 	}
 
@@ -441,12 +420,12 @@
 	wqe.bind.r_key = fence->bind_rkey;
 	fence->bind_rkey = ib_inc_rkey(fence->bind_rkey);
 
-	dev_dbg(rdev_to_dev(qp->rdev),
-		"Posting bind fence-WQE: rkey: %#x QP: %d PD: %p\n",
+	ibdev_dbg(&qp->rdev->ibdev,
+		  "Posting bind fence-WQE: rkey: %#x QP: %d PD: %p\n",
 		wqe.bind.r_key, qp->qplib_qp.id, pd);
 	rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe);
 	if (rc) {
-		dev_err(rdev_to_dev(qp->rdev), "Failed to bind fence-WQE\n");
+		ibdev_err(&qp->rdev->ibdev, "Failed to bind fence-WQE\n");
 		return rc;
 	}
 	bnxt_qplib_post_send_db(&qp->qplib_qp);
@@ -497,7 +476,7 @@
 				  DMA_BIDIRECTIONAL);
 	rc = dma_mapping_error(dev, dma_addr);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to dma-map fence-MR-mem\n");
+		ibdev_err(&rdev->ibdev, "Failed to dma-map fence-MR-mem\n");
 		rc = -EIO;
 		fence->dma_addr = 0;
 		goto fail;
@@ -517,7 +496,7 @@
 	mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags);
 	rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to alloc fence-HW-MR\n");
+		ibdev_err(&rdev->ibdev, "Failed to alloc fence-HW-MR\n");
 		goto fail;
 	}
 
@@ -529,7 +508,7 @@
 	rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl_tbl,
 			       BNXT_RE_FENCE_PBL_SIZE, false, PAGE_SIZE);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to register fence-MR\n");
+		ibdev_err(&rdev->ibdev, "Failed to register fence-MR\n");
 		goto fail;
 	}
 	mr->ib_mr.rkey = mr->qplib_mr.rkey;
@@ -537,8 +516,8 @@
 	/* Create a fence MW only for kernel consumers */
 	mw = bnxt_re_alloc_mw(&pd->ib_pd, IB_MW_TYPE_1, NULL);
 	if (IS_ERR(mw)) {
-		dev_err(rdev_to_dev(rdev),
-			"Failed to create fence-MW for PD: %p\n", pd);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to create fence-MW for PD: %p\n", pd);
 		rc = PTR_ERR(mw);
 		goto fail;
 	}
@@ -553,7 +532,7 @@
 }
 
 /* Protection Domains */
-void bnxt_re_dealloc_pd(struct ib_pd *ib_pd, struct ib_udata *udata)
+int bnxt_re_dealloc_pd(struct ib_pd *ib_pd, struct ib_udata *udata)
 {
 	struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
 	struct bnxt_re_dev *rdev = pd->rdev;
@@ -563,6 +542,7 @@
 	if (pd->qplib_pd.id)
 		bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
 				      &pd->qplib_pd);
+	return 0;
 }
 
 int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
@@ -576,7 +556,7 @@
 
 	pd->rdev = rdev;
 	if (bnxt_qplib_alloc_pd(&rdev->qplib_res.pd_tbl, &pd->qplib_pd)) {
-		dev_err(rdev_to_dev(rdev), "Failed to allocate HW PD");
+		ibdev_err(&rdev->ibdev, "Failed to allocate HW PD");
 		rc = -ENOMEM;
 		goto fail;
 	}
@@ -603,16 +583,16 @@
 
 		rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
 		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to copy user response\n");
+			ibdev_err(&rdev->ibdev,
+				  "Failed to copy user response\n");
 			goto dbfail;
 		}
 	}
 
 	if (!udata)
 		if (bnxt_re_create_fence_mr(pd))
-			dev_warn(rdev_to_dev(rdev),
-				 "Failed to create Fence-MR\n");
+			ibdev_warn(&rdev->ibdev,
+				   "Failed to create Fence-MR\n");
 	return 0;
 dbfail:
 	bnxt_qplib_dealloc_pd(&rdev->qplib_res, &rdev->qplib_res.pd_tbl,
@@ -622,13 +602,14 @@
 }
 
 /* Address Handles */
-void bnxt_re_destroy_ah(struct ib_ah *ib_ah, u32 flags)
+int bnxt_re_destroy_ah(struct ib_ah *ib_ah, u32 flags)
 {
 	struct bnxt_re_ah *ah = container_of(ib_ah, struct bnxt_re_ah, ib_ah);
 	struct bnxt_re_dev *rdev = ah->rdev;
 
 	bnxt_qplib_destroy_ah(&rdev->qplib_res, &ah->qplib_ah,
 			      !(flags & RDMA_DESTROY_AH_SLEEPABLE));
+	return 0;
 }
 
 static u8 bnxt_re_stack_to_dev_nw_type(enum rdma_network_type ntype)
@@ -649,20 +630,22 @@
 	return nw_type;
 }
 
-int bnxt_re_create_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr,
-		      u32 flags, struct ib_udata *udata)
+int bnxt_re_create_ah(struct ib_ah *ib_ah, struct rdma_ah_init_attr *init_attr,
+		      struct ib_udata *udata)
 {
 	struct ib_pd *ib_pd = ib_ah->pd;
 	struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
+	struct rdma_ah_attr *ah_attr = init_attr->ah_attr;
 	const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
 	struct bnxt_re_dev *rdev = pd->rdev;
 	const struct ib_gid_attr *sgid_attr;
+	struct bnxt_re_gid_ctx *ctx;
 	struct bnxt_re_ah *ah = container_of(ib_ah, struct bnxt_re_ah, ib_ah);
 	u8 nw_type;
 	int rc;
 
 	if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) {
-		dev_err(rdev_to_dev(rdev), "Failed to alloc AH: GRH not set");
+		ibdev_err(&rdev->ibdev, "Failed to alloc AH: GRH not set");
 		return -EINVAL;
 	}
 
@@ -672,28 +655,28 @@
 	/* Supply the configuration for the HW */
 	memcpy(ah->qplib_ah.dgid.data, grh->dgid.raw,
 	       sizeof(union ib_gid));
-	/*
-	 * If RoCE V2 is enabled, stack will have two entries for
-	 * each GID entry. Avoiding this duplicte entry in HW. Dividing
-	 * the GID index by 2 for RoCE V2
+	sgid_attr = grh->sgid_attr;
+	/* Get the HW context of the GID. The reference
+	 * of GID table entry is already taken by the caller.
 	 */
-	ah->qplib_ah.sgid_index = grh->sgid_index / 2;
+	ctx = rdma_read_gid_hw_context(sgid_attr);
+	ah->qplib_ah.sgid_index = ctx->idx;
 	ah->qplib_ah.host_sgid_index = grh->sgid_index;
 	ah->qplib_ah.traffic_class = grh->traffic_class;
 	ah->qplib_ah.flow_label = grh->flow_label;
 	ah->qplib_ah.hop_limit = grh->hop_limit;
 	ah->qplib_ah.sl = rdma_ah_get_sl(ah_attr);
 
-	sgid_attr = grh->sgid_attr;
 	/* Get network header type for this GID */
 	nw_type = rdma_gid_attr_network_type(sgid_attr);
 	ah->qplib_ah.nw_type = bnxt_re_stack_to_dev_nw_type(nw_type);
 
 	memcpy(ah->qplib_ah.dmac, ah_attr->roce.dmac, ETH_ALEN);
 	rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah,
-				  !(flags & RDMA_CREATE_AH_SLEEPABLE));
+				  !(init_attr->flags &
+				    RDMA_CREATE_AH_SLEEPABLE));
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to allocate HW AH");
+		ibdev_err(&rdev->ibdev, "Failed to allocate HW AH");
 		return rc;
 	}
 
@@ -760,6 +743,49 @@
 	spin_unlock_irqrestore(&qp->scq->cq_lock, flags);
 }
 
+static int bnxt_re_destroy_gsi_sqp(struct bnxt_re_qp *qp)
+{
+	struct bnxt_re_qp *gsi_sqp;
+	struct bnxt_re_ah *gsi_sah;
+	struct bnxt_re_dev *rdev;
+	int rc = 0;
+
+	rdev = qp->rdev;
+	gsi_sqp = rdev->gsi_ctx.gsi_sqp;
+	gsi_sah = rdev->gsi_ctx.gsi_sah;
+
+	ibdev_dbg(&rdev->ibdev, "Destroy the shadow AH\n");
+	bnxt_qplib_destroy_ah(&rdev->qplib_res,
+			      &gsi_sah->qplib_ah,
+			      true);
+	bnxt_qplib_clean_qp(&qp->qplib_qp);
+
+	ibdev_dbg(&rdev->ibdev, "Destroy the shadow QP\n");
+	rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &gsi_sqp->qplib_qp);
+	if (rc) {
+		ibdev_err(&rdev->ibdev, "Destroy Shadow QP failed");
+		goto fail;
+	}
+	bnxt_qplib_free_qp_res(&rdev->qplib_res, &gsi_sqp->qplib_qp);
+
+	/* remove from active qp list */
+	mutex_lock(&rdev->qp_lock);
+	list_del(&gsi_sqp->list);
+	mutex_unlock(&rdev->qp_lock);
+	atomic_dec(&rdev->qp_count);
+
+	kfree(rdev->gsi_ctx.sqp_tbl);
+	kfree(gsi_sah);
+	kfree(gsi_sqp);
+	rdev->gsi_ctx.gsi_sqp = NULL;
+	rdev->gsi_ctx.gsi_sah = NULL;
+	rdev->gsi_ctx.sqp_tbl = NULL;
+
+	return 0;
+fail:
+	return rc;
+}
+
 /* Queue Pairs */
 int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 {
@@ -769,9 +795,10 @@
 	int rc;
 
 	bnxt_qplib_flush_cqn_wq(&qp->qplib_qp);
+
 	rc = bnxt_qplib_destroy_qp(&rdev->qplib_res, &qp->qplib_qp);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to destroy HW QP");
+		ibdev_err(&rdev->ibdev, "Failed to destroy HW QP");
 		return rc;
 	}
 
@@ -783,40 +810,24 @@
 
 	bnxt_qplib_free_qp_res(&rdev->qplib_res, &qp->qplib_qp);
 
-	if (ib_qp->qp_type == IB_QPT_GSI && rdev->qp1_sqp) {
-		bnxt_qplib_destroy_ah(&rdev->qplib_res, &rdev->sqp_ah->qplib_ah,
-				      false);
-
-		bnxt_qplib_clean_qp(&qp->qplib_qp);
-		rc = bnxt_qplib_destroy_qp(&rdev->qplib_res,
-					   &rdev->qp1_sqp->qplib_qp);
-		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to destroy Shadow QP");
-			return rc;
-		}
-		bnxt_qplib_free_qp_res(&rdev->qplib_res,
-				       &rdev->qp1_sqp->qplib_qp);
-		mutex_lock(&rdev->qp_lock);
-		list_del(&rdev->qp1_sqp->list);
-		atomic_dec(&rdev->qp_count);
-		mutex_unlock(&rdev->qp_lock);
-
-		kfree(rdev->sqp_ah);
-		kfree(rdev->qp1_sqp);
-		rdev->qp1_sqp = NULL;
-		rdev->sqp_ah = NULL;
+	if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp) {
+		rc = bnxt_re_destroy_gsi_sqp(qp);
+		if (rc)
+			goto sh_fail;
 	}
 
+	mutex_lock(&rdev->qp_lock);
+	list_del(&qp->list);
+	mutex_unlock(&rdev->qp_lock);
+	atomic_dec(&rdev->qp_count);
+
 	ib_umem_release(qp->rumem);
 	ib_umem_release(qp->sumem);
 
-	mutex_lock(&rdev->qp_lock);
-	list_del(&qp->list);
-	atomic_dec(&rdev->qp_count);
-	mutex_unlock(&rdev->qp_lock);
 	kfree(qp);
 	return 0;
+sh_fail:
+	return rc;
 }
 
 static u8 __from_ib_qp_type(enum ib_qp_type type)
@@ -833,49 +844,118 @@
 	}
 }
 
+static u16 bnxt_re_setup_rwqe_size(struct bnxt_qplib_qp *qplqp,
+				   int rsge, int max)
+{
+	if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC)
+		rsge = max;
+	return bnxt_re_get_rwqe_size(rsge);
+}
+
+static u16 bnxt_re_get_wqe_size(int ilsize, int nsge)
+{
+	u16 wqe_size, calc_ils;
+
+	wqe_size = bnxt_re_get_swqe_size(nsge);
+	if (ilsize) {
+		calc_ils = sizeof(struct sq_send_hdr) + ilsize;
+		wqe_size = max_t(u16, calc_ils, wqe_size);
+		wqe_size = ALIGN(wqe_size, sizeof(struct sq_send_hdr));
+	}
+	return wqe_size;
+}
+
+static int bnxt_re_setup_swqe_size(struct bnxt_re_qp *qp,
+				   struct ib_qp_init_attr *init_attr)
+{
+	struct bnxt_qplib_dev_attr *dev_attr;
+	struct bnxt_qplib_qp *qplqp;
+	struct bnxt_re_dev *rdev;
+	struct bnxt_qplib_q *sq;
+	int align, ilsize;
+
+	rdev = qp->rdev;
+	qplqp = &qp->qplib_qp;
+	sq = &qplqp->sq;
+	dev_attr = &rdev->dev_attr;
+
+	align = sizeof(struct sq_send_hdr);
+	ilsize = ALIGN(init_attr->cap.max_inline_data, align);
+
+	sq->wqe_size = bnxt_re_get_wqe_size(ilsize, sq->max_sge);
+	if (sq->wqe_size > bnxt_re_get_swqe_size(dev_attr->max_qp_sges))
+		return -EINVAL;
+	/* For gen p4 and gen p5 backward compatibility mode
+	 * wqe size is fixed to 128 bytes
+	 */
+	if (sq->wqe_size < bnxt_re_get_swqe_size(dev_attr->max_qp_sges) &&
+			qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC)
+		sq->wqe_size = bnxt_re_get_swqe_size(dev_attr->max_qp_sges);
+
+	if (init_attr->cap.max_inline_data) {
+		qplqp->max_inline_data = sq->wqe_size -
+			sizeof(struct sq_send_hdr);
+		init_attr->cap.max_inline_data = qplqp->max_inline_data;
+		if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC)
+			sq->max_sge = qplqp->max_inline_data /
+				sizeof(struct sq_sge);
+	}
+
+	return 0;
+}
+
 static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
 				struct bnxt_re_qp *qp, struct ib_udata *udata)
 {
+	struct bnxt_qplib_qp *qplib_qp;
+	struct bnxt_re_ucontext *cntx;
 	struct bnxt_re_qp_req ureq;
-	struct bnxt_qplib_qp *qplib_qp = &qp->qplib_qp;
-	struct ib_umem *umem;
 	int bytes = 0, psn_sz;
-	struct bnxt_re_ucontext *cntx = rdma_udata_to_drv_context(
-		udata, struct bnxt_re_ucontext, ib_uctx);
+	struct ib_umem *umem;
+	int psn_nume;
 
+	qplib_qp = &qp->qplib_qp;
+	cntx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext,
+					 ib_uctx);
 	if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
 		return -EFAULT;
 
-	bytes = (qplib_qp->sq.max_wqe * BNXT_QPLIB_MAX_SQE_ENTRY_SIZE);
+	bytes = (qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size);
 	/* Consider mapping PSN search memory only for RC QPs. */
 	if (qplib_qp->type == CMDQ_CREATE_QP_TYPE_RC) {
-		psn_sz = bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx) ?
-					sizeof(struct sq_psn_search_ext) :
-					sizeof(struct sq_psn_search);
-		bytes += (qplib_qp->sq.max_wqe * psn_sz);
+		psn_sz = bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) ?
+						   sizeof(struct sq_psn_search_ext) :
+						   sizeof(struct sq_psn_search);
+		psn_nume = (qplib_qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ?
+			    qplib_qp->sq.max_wqe :
+			    ((qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size) /
+			      sizeof(struct bnxt_qplib_sge));
+		bytes += (psn_nume * psn_sz);
 	}
+
 	bytes = PAGE_ALIGN(bytes);
-	umem = ib_umem_get(udata, ureq.qpsva, bytes, IB_ACCESS_LOCAL_WRITE, 1);
+	umem = ib_umem_get(&rdev->ibdev, ureq.qpsva, bytes,
+			   IB_ACCESS_LOCAL_WRITE);
 	if (IS_ERR(umem))
 		return PTR_ERR(umem);
 
 	qp->sumem = umem;
-	qplib_qp->sq.sg_info.sglist = umem->sg_head.sgl;
-	qplib_qp->sq.sg_info.npages = ib_umem_num_pages(umem);
-	qplib_qp->sq.sg_info.nmap = umem->nmap;
+	qplib_qp->sq.sg_info.umem = umem;
+	qplib_qp->sq.sg_info.pgsize = PAGE_SIZE;
+	qplib_qp->sq.sg_info.pgshft = PAGE_SHIFT;
 	qplib_qp->qp_handle = ureq.qp_handle;
 
 	if (!qp->qplib_qp.srq) {
-		bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
+		bytes = (qplib_qp->rq.max_wqe * qplib_qp->rq.wqe_size);
 		bytes = PAGE_ALIGN(bytes);
-		umem = ib_umem_get(udata, ureq.qprva, bytes,
-				   IB_ACCESS_LOCAL_WRITE, 1);
+		umem = ib_umem_get(&rdev->ibdev, ureq.qprva, bytes,
+				   IB_ACCESS_LOCAL_WRITE);
 		if (IS_ERR(umem))
 			goto rqfail;
 		qp->rumem = umem;
-		qplib_qp->rq.sg_info.sglist = umem->sg_head.sgl;
-		qplib_qp->rq.sg_info.npages = ib_umem_num_pages(umem);
-		qplib_qp->rq.sg_info.nmap = umem->nmap;
+		qplib_qp->rq.sg_info.umem = umem;
+		qplib_qp->rq.sg_info.pgsize = PAGE_SIZE;
+		qplib_qp->rq.sg_info.pgshft = PAGE_SHIFT;
 	}
 
 	qplib_qp->dpi = &cntx->dpi;
@@ -923,8 +1003,8 @@
 
 	rc = bnxt_qplib_create_ah(&rdev->qplib_res, &ah->qplib_ah, false);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev),
-			"Failed to allocate HW AH for Shadow QP");
+		ibdev_err(&rdev->ibdev,
+			  "Failed to allocate HW AH for Shadow QP");
 		goto fail;
 	}
 
@@ -961,18 +1041,24 @@
 	qp->qplib_qp.sig_type = true;
 
 	/* Shadow QP SQ depth should be same as QP1 RQ depth */
+	qp->qplib_qp.sq.wqe_size = bnxt_re_get_wqe_size(0, 6);
 	qp->qplib_qp.sq.max_wqe = qp1_qp->rq.max_wqe;
 	qp->qplib_qp.sq.max_sge = 2;
 	/* Q full delta can be 1 since it is internal QP */
 	qp->qplib_qp.sq.q_full_delta = 1;
+	qp->qplib_qp.sq.sg_info.pgsize = PAGE_SIZE;
+	qp->qplib_qp.sq.sg_info.pgshft = PAGE_SHIFT;
 
 	qp->qplib_qp.scq = qp1_qp->scq;
 	qp->qplib_qp.rcq = qp1_qp->rcq;
 
+	qp->qplib_qp.rq.wqe_size = bnxt_re_get_rwqe_size(6);
 	qp->qplib_qp.rq.max_wqe = qp1_qp->rq.max_wqe;
 	qp->qplib_qp.rq.max_sge = qp1_qp->rq.max_sge;
 	/* Q full delta can be 1 since it is internal QP */
 	qp->qplib_qp.rq.q_full_delta = 1;
+	qp->qplib_qp.rq.sg_info.pgsize = PAGE_SIZE;
+	qp->qplib_qp.rq.sg_info.pgshft = PAGE_SHIFT;
 
 	qp->qplib_qp.mtu = qp1_qp->mtu;
 
@@ -984,8 +1070,6 @@
 	if (rc)
 		goto fail;
 
-	rdev->sqp_id = qp->qplib_qp.id;
-
 	spin_lock_init(&qp->sq_lock);
 	INIT_LIST_HEAD(&qp->list);
 	mutex_lock(&rdev->qp_lock);
@@ -998,6 +1082,336 @@
 	return NULL;
 }
 
+static int bnxt_re_init_rq_attr(struct bnxt_re_qp *qp,
+				struct ib_qp_init_attr *init_attr)
+{
+	struct bnxt_qplib_dev_attr *dev_attr;
+	struct bnxt_qplib_qp *qplqp;
+	struct bnxt_re_dev *rdev;
+	struct bnxt_qplib_q *rq;
+	int entries;
+
+	rdev = qp->rdev;
+	qplqp = &qp->qplib_qp;
+	rq = &qplqp->rq;
+	dev_attr = &rdev->dev_attr;
+
+	if (init_attr->srq) {
+		struct bnxt_re_srq *srq;
+
+		srq = container_of(init_attr->srq, struct bnxt_re_srq, ib_srq);
+		if (!srq) {
+			ibdev_err(&rdev->ibdev, "SRQ not found");
+			return -EINVAL;
+		}
+		qplqp->srq = &srq->qplib_srq;
+		rq->max_wqe = 0;
+	} else {
+		rq->max_sge = init_attr->cap.max_recv_sge;
+		if (rq->max_sge > dev_attr->max_qp_sges)
+			rq->max_sge = dev_attr->max_qp_sges;
+		init_attr->cap.max_recv_sge = rq->max_sge;
+		rq->wqe_size = bnxt_re_setup_rwqe_size(qplqp, rq->max_sge,
+						       dev_attr->max_qp_sges);
+		/* Allocate 1 more than what's provided so posting max doesn't
+		 * mean empty.
+		 */
+		entries = roundup_pow_of_two(init_attr->cap.max_recv_wr + 1);
+		rq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + 1);
+		rq->q_full_delta = 0;
+		rq->sg_info.pgsize = PAGE_SIZE;
+		rq->sg_info.pgshft = PAGE_SHIFT;
+	}
+
+	return 0;
+}
+
+static void bnxt_re_adjust_gsi_rq_attr(struct bnxt_re_qp *qp)
+{
+	struct bnxt_qplib_dev_attr *dev_attr;
+	struct bnxt_qplib_qp *qplqp;
+	struct bnxt_re_dev *rdev;
+
+	rdev = qp->rdev;
+	qplqp = &qp->qplib_qp;
+	dev_attr = &rdev->dev_attr;
+
+	if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) {
+		qplqp->rq.max_sge = dev_attr->max_qp_sges;
+		if (qplqp->rq.max_sge > dev_attr->max_qp_sges)
+			qplqp->rq.max_sge = dev_attr->max_qp_sges;
+		qplqp->rq.max_sge = 6;
+	}
+}
+
+static int bnxt_re_init_sq_attr(struct bnxt_re_qp *qp,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata)
+{
+	struct bnxt_qplib_dev_attr *dev_attr;
+	struct bnxt_qplib_qp *qplqp;
+	struct bnxt_re_dev *rdev;
+	struct bnxt_qplib_q *sq;
+	int entries;
+	int diff;
+	int rc;
+
+	rdev = qp->rdev;
+	qplqp = &qp->qplib_qp;
+	sq = &qplqp->sq;
+	dev_attr = &rdev->dev_attr;
+
+	sq->max_sge = init_attr->cap.max_send_sge;
+	if (sq->max_sge > dev_attr->max_qp_sges) {
+		sq->max_sge = dev_attr->max_qp_sges;
+		init_attr->cap.max_send_sge = sq->max_sge;
+	}
+
+	rc = bnxt_re_setup_swqe_size(qp, init_attr);
+	if (rc)
+		return rc;
+
+	entries = init_attr->cap.max_send_wr;
+	/* Allocate 128 + 1 more than what's provided */
+	diff = (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) ?
+		0 : BNXT_QPLIB_RESERVED_QP_WRS;
+	entries = roundup_pow_of_two(entries + diff + 1);
+	sq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + diff + 1);
+	sq->q_full_delta = diff + 1;
+	/*
+	 * Reserving one slot for Phantom WQE. Application can
+	 * post one extra entry in this case. But allowing this to avoid
+	 * unexpected Queue full condition
+	 */
+	qplqp->sq.q_full_delta -= 1;
+	qplqp->sq.sg_info.pgsize = PAGE_SIZE;
+	qplqp->sq.sg_info.pgshft = PAGE_SHIFT;
+
+	return 0;
+}
+
+static void bnxt_re_adjust_gsi_sq_attr(struct bnxt_re_qp *qp,
+				       struct ib_qp_init_attr *init_attr)
+{
+	struct bnxt_qplib_dev_attr *dev_attr;
+	struct bnxt_qplib_qp *qplqp;
+	struct bnxt_re_dev *rdev;
+	int entries;
+
+	rdev = qp->rdev;
+	qplqp = &qp->qplib_qp;
+	dev_attr = &rdev->dev_attr;
+
+	if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) {
+		entries = roundup_pow_of_two(init_attr->cap.max_send_wr + 1);
+		qplqp->sq.max_wqe = min_t(u32, entries,
+					  dev_attr->max_qp_wqes + 1);
+		qplqp->sq.q_full_delta = qplqp->sq.max_wqe -
+			init_attr->cap.max_send_wr;
+		qplqp->sq.max_sge++; /* Need one extra sge to put UD header */
+		if (qplqp->sq.max_sge > dev_attr->max_qp_sges)
+			qplqp->sq.max_sge = dev_attr->max_qp_sges;
+	}
+}
+
+static int bnxt_re_init_qp_type(struct bnxt_re_dev *rdev,
+				struct ib_qp_init_attr *init_attr)
+{
+	struct bnxt_qplib_chip_ctx *chip_ctx;
+	int qptype;
+
+	chip_ctx = rdev->chip_ctx;
+
+	qptype = __from_ib_qp_type(init_attr->qp_type);
+	if (qptype == IB_QPT_MAX) {
+		ibdev_err(&rdev->ibdev, "QP type 0x%x not supported", qptype);
+		qptype = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (bnxt_qplib_is_chip_gen_p5(chip_ctx) &&
+	    init_attr->qp_type == IB_QPT_GSI)
+		qptype = CMDQ_CREATE_QP_TYPE_GSI;
+out:
+	return qptype;
+}
+
+static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata)
+{
+	struct bnxt_qplib_dev_attr *dev_attr;
+	struct bnxt_qplib_qp *qplqp;
+	struct bnxt_re_dev *rdev;
+	struct bnxt_re_cq *cq;
+	int rc = 0, qptype;
+
+	rdev = qp->rdev;
+	qplqp = &qp->qplib_qp;
+	dev_attr = &rdev->dev_attr;
+
+	/* Setup misc params */
+	ether_addr_copy(qplqp->smac, rdev->netdev->dev_addr);
+	qplqp->pd = &pd->qplib_pd;
+	qplqp->qp_handle = (u64)qplqp;
+	qplqp->max_inline_data = init_attr->cap.max_inline_data;
+	qplqp->sig_type = ((init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) ?
+			    true : false);
+	qptype = bnxt_re_init_qp_type(rdev, init_attr);
+	if (qptype < 0) {
+		rc = qptype;
+		goto out;
+	}
+	qplqp->type = (u8)qptype;
+	qplqp->wqe_mode = rdev->chip_ctx->modes.wqe_mode;
+
+	if (init_attr->qp_type == IB_QPT_RC) {
+		qplqp->max_rd_atomic = dev_attr->max_qp_rd_atom;
+		qplqp->max_dest_rd_atomic = dev_attr->max_qp_init_rd_atom;
+	}
+	qplqp->mtu = ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu));
+	qplqp->dpi = &rdev->dpi_privileged; /* Doorbell page */
+	if (init_attr->create_flags)
+		ibdev_dbg(&rdev->ibdev,
+			  "QP create flags 0x%x not supported",
+			  init_attr->create_flags);
+
+	/* Setup CQs */
+	if (init_attr->send_cq) {
+		cq = container_of(init_attr->send_cq, struct bnxt_re_cq, ib_cq);
+		if (!cq) {
+			ibdev_err(&rdev->ibdev, "Send CQ not found");
+			rc = -EINVAL;
+			goto out;
+		}
+		qplqp->scq = &cq->qplib_cq;
+		qp->scq = cq;
+	}
+
+	if (init_attr->recv_cq) {
+		cq = container_of(init_attr->recv_cq, struct bnxt_re_cq, ib_cq);
+		if (!cq) {
+			ibdev_err(&rdev->ibdev, "Receive CQ not found");
+			rc = -EINVAL;
+			goto out;
+		}
+		qplqp->rcq = &cq->qplib_cq;
+		qp->rcq = cq;
+	}
+
+	/* Setup RQ/SRQ */
+	rc = bnxt_re_init_rq_attr(qp, init_attr);
+	if (rc)
+		goto out;
+	if (init_attr->qp_type == IB_QPT_GSI)
+		bnxt_re_adjust_gsi_rq_attr(qp);
+
+	/* Setup SQ */
+	rc = bnxt_re_init_sq_attr(qp, init_attr, udata);
+	if (rc)
+		goto out;
+	if (init_attr->qp_type == IB_QPT_GSI)
+		bnxt_re_adjust_gsi_sq_attr(qp, init_attr);
+
+	if (udata) /* This will update DPI and qp_handle */
+		rc = bnxt_re_init_user_qp(rdev, pd, qp, udata);
+out:
+	return rc;
+}
+
+static int bnxt_re_create_shadow_gsi(struct bnxt_re_qp *qp,
+				     struct bnxt_re_pd *pd)
+{
+	struct bnxt_re_sqp_entries *sqp_tbl = NULL;
+	struct bnxt_re_dev *rdev;
+	struct bnxt_re_qp *sqp;
+	struct bnxt_re_ah *sah;
+	int rc = 0;
+
+	rdev = qp->rdev;
+	/* Create a shadow QP to handle the QP1 traffic */
+	sqp_tbl = kzalloc(sizeof(*sqp_tbl) * BNXT_RE_MAX_GSI_SQP_ENTRIES,
+			  GFP_KERNEL);
+	if (!sqp_tbl)
+		return -ENOMEM;
+	rdev->gsi_ctx.sqp_tbl = sqp_tbl;
+
+	sqp = bnxt_re_create_shadow_qp(pd, &rdev->qplib_res, &qp->qplib_qp);
+	if (!sqp) {
+		rc = -ENODEV;
+		ibdev_err(&rdev->ibdev, "Failed to create Shadow QP for QP1");
+		goto out;
+	}
+	rdev->gsi_ctx.gsi_sqp = sqp;
+
+	sqp->rcq = qp->rcq;
+	sqp->scq = qp->scq;
+	sah = bnxt_re_create_shadow_qp_ah(pd, &rdev->qplib_res,
+					  &qp->qplib_qp);
+	if (!sah) {
+		bnxt_qplib_destroy_qp(&rdev->qplib_res,
+				      &sqp->qplib_qp);
+		rc = -ENODEV;
+		ibdev_err(&rdev->ibdev,
+			  "Failed to create AH entry for ShadowQP");
+		goto out;
+	}
+	rdev->gsi_ctx.gsi_sah = sah;
+
+	return 0;
+out:
+	kfree(sqp_tbl);
+	return rc;
+}
+
+static int bnxt_re_create_gsi_qp(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd,
+				 struct ib_qp_init_attr *init_attr)
+{
+	struct bnxt_re_dev *rdev;
+	struct bnxt_qplib_qp *qplqp;
+	int rc = 0;
+
+	rdev = qp->rdev;
+	qplqp = &qp->qplib_qp;
+
+	qplqp->rq_hdr_buf_size = BNXT_QPLIB_MAX_QP1_RQ_HDR_SIZE_V2;
+	qplqp->sq_hdr_buf_size = BNXT_QPLIB_MAX_QP1_SQ_HDR_SIZE_V2;
+
+	rc = bnxt_qplib_create_qp1(&rdev->qplib_res, qplqp);
+	if (rc) {
+		ibdev_err(&rdev->ibdev, "create HW QP1 failed!");
+		goto out;
+	}
+
+	rc = bnxt_re_create_shadow_gsi(qp, pd);
+out:
+	return rc;
+}
+
+static bool bnxt_re_test_qp_limits(struct bnxt_re_dev *rdev,
+				   struct ib_qp_init_attr *init_attr,
+				   struct bnxt_qplib_dev_attr *dev_attr)
+{
+	bool rc = true;
+
+	if (init_attr->cap.max_send_wr > dev_attr->max_qp_wqes ||
+	    init_attr->cap.max_recv_wr > dev_attr->max_qp_wqes ||
+	    init_attr->cap.max_send_sge > dev_attr->max_qp_sges ||
+	    init_attr->cap.max_recv_sge > dev_attr->max_qp_sges ||
+	    init_attr->cap.max_inline_data > dev_attr->max_inline_data) {
+		ibdev_err(&rdev->ibdev,
+			  "Create QP failed - max exceeded! 0x%x/0x%x 0x%x/0x%x 0x%x/0x%x 0x%x/0x%x 0x%x/0x%x",
+			  init_attr->cap.max_send_wr, dev_attr->max_qp_wqes,
+			  init_attr->cap.max_recv_wr, dev_attr->max_qp_wqes,
+			  init_attr->cap.max_send_sge, dev_attr->max_qp_sges,
+			  init_attr->cap.max_recv_sge, dev_attr->max_qp_sges,
+			  init_attr->cap.max_inline_data,
+			  dev_attr->max_inline_data);
+		rc = false;
+	}
+	return rc;
+}
+
 struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd,
 				struct ib_qp_init_attr *qp_init_attr,
 				struct ib_udata *udata)
@@ -1006,197 +1420,60 @@
 	struct bnxt_re_dev *rdev = pd->rdev;
 	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
 	struct bnxt_re_qp *qp;
-	struct bnxt_re_cq *cq;
-	struct bnxt_re_srq *srq;
-	int rc, entries;
+	int rc;
 
-	if ((qp_init_attr->cap.max_send_wr > dev_attr->max_qp_wqes) ||
-	    (qp_init_attr->cap.max_recv_wr > dev_attr->max_qp_wqes) ||
-	    (qp_init_attr->cap.max_send_sge > dev_attr->max_qp_sges) ||
-	    (qp_init_attr->cap.max_recv_sge > dev_attr->max_qp_sges) ||
-	    (qp_init_attr->cap.max_inline_data > dev_attr->max_inline_data))
-		return ERR_PTR(-EINVAL);
+	rc = bnxt_re_test_qp_limits(rdev, qp_init_attr, dev_attr);
+	if (!rc) {
+		rc = -EINVAL;
+		goto exit;
+	}
 
 	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
-	if (!qp)
-		return ERR_PTR(-ENOMEM);
-
+	if (!qp) {
+		rc = -ENOMEM;
+		goto exit;
+	}
 	qp->rdev = rdev;
-	ether_addr_copy(qp->qplib_qp.smac, rdev->netdev->dev_addr);
-	qp->qplib_qp.pd = &pd->qplib_pd;
-	qp->qplib_qp.qp_handle = (u64)(unsigned long)(&qp->qplib_qp);
-	qp->qplib_qp.type = __from_ib_qp_type(qp_init_attr->qp_type);
-
-	if (qp_init_attr->qp_type == IB_QPT_GSI &&
-	    bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx))
-		qp->qplib_qp.type = CMDQ_CREATE_QP_TYPE_GSI;
-	if (qp->qplib_qp.type == IB_QPT_MAX) {
-		dev_err(rdev_to_dev(rdev), "QP type 0x%x not supported",
-			qp->qplib_qp.type);
-		rc = -EINVAL;
+	rc = bnxt_re_init_qp_attr(qp, pd, qp_init_attr, udata);
+	if (rc)
 		goto fail;
-	}
-
-	qp->qplib_qp.max_inline_data = qp_init_attr->cap.max_inline_data;
-	qp->qplib_qp.sig_type = ((qp_init_attr->sq_sig_type ==
-				  IB_SIGNAL_ALL_WR) ? true : false);
-
-	qp->qplib_qp.sq.max_sge = qp_init_attr->cap.max_send_sge;
-	if (qp->qplib_qp.sq.max_sge > dev_attr->max_qp_sges)
-		qp->qplib_qp.sq.max_sge = dev_attr->max_qp_sges;
-
-	if (qp_init_attr->send_cq) {
-		cq = container_of(qp_init_attr->send_cq, struct bnxt_re_cq,
-				  ib_cq);
-		if (!cq) {
-			dev_err(rdev_to_dev(rdev), "Send CQ not found");
-			rc = -EINVAL;
-			goto fail;
-		}
-		qp->qplib_qp.scq = &cq->qplib_cq;
-		qp->scq = cq;
-	}
-
-	if (qp_init_attr->recv_cq) {
-		cq = container_of(qp_init_attr->recv_cq, struct bnxt_re_cq,
-				  ib_cq);
-		if (!cq) {
-			dev_err(rdev_to_dev(rdev), "Receive CQ not found");
-			rc = -EINVAL;
-			goto fail;
-		}
-		qp->qplib_qp.rcq = &cq->qplib_cq;
-		qp->rcq = cq;
-	}
-
-	if (qp_init_attr->srq) {
-		srq = container_of(qp_init_attr->srq, struct bnxt_re_srq,
-				   ib_srq);
-		if (!srq) {
-			dev_err(rdev_to_dev(rdev), "SRQ not found");
-			rc = -EINVAL;
-			goto fail;
-		}
-		qp->qplib_qp.srq = &srq->qplib_srq;
-		qp->qplib_qp.rq.max_wqe = 0;
-	} else {
-		/* Allocate 1 more than what's provided so posting max doesn't
-		 * mean empty
-		 */
-		entries = roundup_pow_of_two(qp_init_attr->cap.max_recv_wr + 1);
-		qp->qplib_qp.rq.max_wqe = min_t(u32, entries,
-						dev_attr->max_qp_wqes + 1);
-
-		qp->qplib_qp.rq.q_full_delta = qp->qplib_qp.rq.max_wqe -
-						qp_init_attr->cap.max_recv_wr;
-
-		qp->qplib_qp.rq.max_sge = qp_init_attr->cap.max_recv_sge;
-		if (qp->qplib_qp.rq.max_sge > dev_attr->max_qp_sges)
-			qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges;
-	}
-
-	qp->qplib_qp.mtu = ib_mtu_enum_to_int(iboe_get_mtu(rdev->netdev->mtu));
 
 	if (qp_init_attr->qp_type == IB_QPT_GSI &&
-	    !(bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx))) {
-		/* Allocate 1 more than what's provided */
-		entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr + 1);
-		qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
-						dev_attr->max_qp_wqes + 1);
-		qp->qplib_qp.sq.q_full_delta = qp->qplib_qp.sq.max_wqe -
-						qp_init_attr->cap.max_send_wr;
-		qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges;
-		if (qp->qplib_qp.rq.max_sge > dev_attr->max_qp_sges)
-			qp->qplib_qp.rq.max_sge = dev_attr->max_qp_sges;
-		qp->qplib_qp.sq.max_sge++;
-		if (qp->qplib_qp.sq.max_sge > dev_attr->max_qp_sges)
-			qp->qplib_qp.sq.max_sge = dev_attr->max_qp_sges;
-
-		qp->qplib_qp.rq_hdr_buf_size =
-					BNXT_QPLIB_MAX_QP1_RQ_HDR_SIZE_V2;
-
-		qp->qplib_qp.sq_hdr_buf_size =
-					BNXT_QPLIB_MAX_QP1_SQ_HDR_SIZE_V2;
-		qp->qplib_qp.dpi = &rdev->dpi_privileged;
-		rc = bnxt_qplib_create_qp1(&rdev->qplib_res, &qp->qplib_qp);
-		if (rc) {
-			dev_err(rdev_to_dev(rdev), "Failed to create HW QP1");
+	    !(bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx))) {
+		rc = bnxt_re_create_gsi_qp(qp, pd, qp_init_attr);
+		if (rc == -ENODEV)
+			goto qp_destroy;
+		if (rc)
 			goto fail;
-		}
-		/* Create a shadow QP to handle the QP1 traffic */
-		rdev->qp1_sqp = bnxt_re_create_shadow_qp(pd, &rdev->qplib_res,
-							 &qp->qplib_qp);
-		if (!rdev->qp1_sqp) {
-			rc = -EINVAL;
-			dev_err(rdev_to_dev(rdev),
-				"Failed to create Shadow QP for QP1");
-			goto qp_destroy;
-		}
-		rdev->sqp_ah = bnxt_re_create_shadow_qp_ah(pd, &rdev->qplib_res,
-							   &qp->qplib_qp);
-		if (!rdev->sqp_ah) {
-			bnxt_qplib_destroy_qp(&rdev->qplib_res,
-					      &rdev->qp1_sqp->qplib_qp);
-			rc = -EINVAL;
-			dev_err(rdev_to_dev(rdev),
-				"Failed to create AH entry for ShadowQP");
-			goto qp_destroy;
-		}
-
 	} else {
-		/* Allocate 128 + 1 more than what's provided */
-		entries = roundup_pow_of_two(qp_init_attr->cap.max_send_wr +
-					     BNXT_QPLIB_RESERVED_QP_WRS + 1);
-		qp->qplib_qp.sq.max_wqe = min_t(u32, entries,
-						dev_attr->max_qp_wqes +
-						BNXT_QPLIB_RESERVED_QP_WRS + 1);
-		qp->qplib_qp.sq.q_full_delta = BNXT_QPLIB_RESERVED_QP_WRS + 1;
-
-		/*
-		 * Reserving one slot for Phantom WQE. Application can
-		 * post one extra entry in this case. But allowing this to avoid
-		 * unexpected Queue full condition
-		 */
-
-		qp->qplib_qp.sq.q_full_delta -= 1;
-
-		qp->qplib_qp.max_rd_atomic = dev_attr->max_qp_rd_atom;
-		qp->qplib_qp.max_dest_rd_atomic = dev_attr->max_qp_init_rd_atom;
-		if (udata) {
-			rc = bnxt_re_init_user_qp(rdev, pd, qp, udata);
-			if (rc)
-				goto fail;
-		} else {
-			qp->qplib_qp.dpi = &rdev->dpi_privileged;
-		}
-
 		rc = bnxt_qplib_create_qp(&rdev->qplib_res, &qp->qplib_qp);
 		if (rc) {
-			dev_err(rdev_to_dev(rdev), "Failed to create HW QP");
+			ibdev_err(&rdev->ibdev, "Failed to create HW QP");
 			goto free_umem;
 		}
+		if (udata) {
+			struct bnxt_re_qp_resp resp;
+
+			resp.qpid = qp->qplib_qp.id;
+			resp.rsvd = 0;
+			rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
+			if (rc) {
+				ibdev_err(&rdev->ibdev, "Failed to copy QP udata");
+				goto qp_destroy;
+			}
+		}
 	}
 
 	qp->ib_qp.qp_num = qp->qplib_qp.id;
+	if (qp_init_attr->qp_type == IB_QPT_GSI)
+		rdev->gsi_ctx.gsi_qp = qp;
 	spin_lock_init(&qp->sq_lock);
 	spin_lock_init(&qp->rq_lock);
-
-	if (udata) {
-		struct bnxt_re_qp_resp resp;
-
-		resp.qpid = qp->ib_qp.qp_num;
-		resp.rsvd = 0;
-		rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
-		if (rc) {
-			dev_err(rdev_to_dev(rdev), "Failed to copy QP udata");
-			goto qp_destroy;
-		}
-	}
 	INIT_LIST_HEAD(&qp->list);
 	mutex_lock(&rdev->qp_lock);
 	list_add_tail(&qp->list, &rdev->qp_list);
-	atomic_inc(&rdev->qp_count);
 	mutex_unlock(&rdev->qp_lock);
+	atomic_inc(&rdev->qp_count);
 
 	return &qp->ib_qp;
 qp_destroy:
@@ -1206,6 +1483,7 @@
 	ib_umem_release(qp->sumem);
 fail:
 	kfree(qp);
+exit:
 	return ERR_PTR(rc);
 }
 
@@ -1288,7 +1566,7 @@
 }
 
 /* Shared Receive Queues */
-void bnxt_re_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata)
+int bnxt_re_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata)
 {
 	struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq,
 					       ib_srq);
@@ -1303,6 +1581,7 @@
 	atomic_dec(&rdev->srq_count);
 	if (nq)
 		nq->budget--;
+	return 0;
 }
 
 static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev,
@@ -1320,16 +1599,17 @@
 	if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
 		return -EFAULT;
 
-	bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
+	bytes = (qplib_srq->max_wqe * qplib_srq->wqe_size);
 	bytes = PAGE_ALIGN(bytes);
-	umem = ib_umem_get(udata, ureq.srqva, bytes, IB_ACCESS_LOCAL_WRITE, 1);
+	umem = ib_umem_get(&rdev->ibdev, ureq.srqva, bytes,
+			   IB_ACCESS_LOCAL_WRITE);
 	if (IS_ERR(umem))
 		return PTR_ERR(umem);
 
 	srq->umem = umem;
-	qplib_srq->sg_info.sglist = umem->sg_head.sgl;
-	qplib_srq->sg_info.npages = ib_umem_num_pages(umem);
-	qplib_srq->sg_info.nmap = umem->nmap;
+	qplib_srq->sg_info.umem = umem;
+	qplib_srq->sg_info.pgsize = PAGE_SIZE;
+	qplib_srq->sg_info.pgshft = PAGE_SHIFT;
 	qplib_srq->srq_handle = ureq.srq_handle;
 	qplib_srq->dpi = &cntx->dpi;
 
@@ -1340,17 +1620,22 @@
 		       struct ib_srq_init_attr *srq_init_attr,
 		       struct ib_udata *udata)
 {
-	struct ib_pd *ib_pd = ib_srq->pd;
-	struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
-	struct bnxt_re_dev *rdev = pd->rdev;
-	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
-	struct bnxt_re_srq *srq =
-		container_of(ib_srq, struct bnxt_re_srq, ib_srq);
+	struct bnxt_qplib_dev_attr *dev_attr;
 	struct bnxt_qplib_nq *nq = NULL;
+	struct bnxt_re_dev *rdev;
+	struct bnxt_re_srq *srq;
+	struct bnxt_re_pd *pd;
+	struct ib_pd *ib_pd;
 	int rc, entries;
 
+	ib_pd = ib_srq->pd;
+	pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
+	rdev = pd->rdev;
+	dev_attr = &rdev->dev_attr;
+	srq = container_of(ib_srq, struct bnxt_re_srq, ib_srq);
+
 	if (srq_init_attr->attr.max_wr >= dev_attr->max_srq_wqes) {
-		dev_err(rdev_to_dev(rdev), "Create CQ failed - max exceeded");
+		ibdev_err(&rdev->ibdev, "Create CQ failed - max exceeded");
 		rc = -EINVAL;
 		goto exit;
 	}
@@ -1369,9 +1654,11 @@
 	entries = roundup_pow_of_two(srq_init_attr->attr.max_wr + 1);
 	if (entries > dev_attr->max_srq_wqes + 1)
 		entries = dev_attr->max_srq_wqes + 1;
-
 	srq->qplib_srq.max_wqe = entries;
+
 	srq->qplib_srq.max_sge = srq_init_attr->attr.max_sge;
+	 /* 128 byte wqe size for SRQ . So use max sges */
+	srq->qplib_srq.wqe_size = bnxt_re_get_rwqe_size(dev_attr->max_srq_sges);
 	srq->qplib_srq.threshold = srq_init_attr->attr.srq_limit;
 	srq->srq_limit = srq_init_attr->attr.srq_limit;
 	srq->qplib_srq.eventq_hw_ring_id = rdev->nq[0].ring_id;
@@ -1385,7 +1672,7 @@
 
 	rc = bnxt_qplib_create_srq(&rdev->qplib_res, &srq->qplib_srq);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Create HW SRQ failed!");
+		ibdev_err(&rdev->ibdev, "Create HW SRQ failed!");
 		goto fail;
 	}
 
@@ -1395,7 +1682,7 @@
 		resp.srqid = srq->qplib_srq.id;
 		rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
 		if (rc) {
-			dev_err(rdev_to_dev(rdev), "SRQ copy to udata failed!");
+			ibdev_err(&rdev->ibdev, "SRQ copy to udata failed!");
 			bnxt_qplib_destroy_srq(&rdev->qplib_res,
 					       &srq->qplib_srq);
 			goto fail;
@@ -1435,7 +1722,7 @@
 		srq->qplib_srq.threshold = srq_attr->srq_limit;
 		rc = bnxt_qplib_modify_srq(&rdev->qplib_res, &srq->qplib_srq);
 		if (rc) {
-			dev_err(rdev_to_dev(rdev), "Modify HW SRQ failed!");
+			ibdev_err(&rdev->ibdev, "Modify HW SRQ failed!");
 			return rc;
 		}
 		/* On success, update the shadow */
@@ -1443,8 +1730,8 @@
 		/* No need to Build and send response back to udata */
 		break;
 	default:
-		dev_err(rdev_to_dev(rdev),
-			"Unsupported srq_attr_mask 0x%x", srq_attr_mask);
+		ibdev_err(&rdev->ibdev,
+			  "Unsupported srq_attr_mask 0x%x", srq_attr_mask);
 		return -EINVAL;
 	}
 	return 0;
@@ -1462,7 +1749,7 @@
 	tsrq.qplib_srq.id = srq->qplib_srq.id;
 	rc = bnxt_qplib_query_srq(&rdev->qplib_res, &tsrq.qplib_srq);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Query HW SRQ failed!");
+		ibdev_err(&rdev->ibdev, "Query HW SRQ failed!");
 		return rc;
 	}
 	srq_attr->max_wr = srq->qplib_srq.max_wqe;
@@ -1504,7 +1791,7 @@
 				    struct bnxt_re_qp *qp1_qp,
 				    int qp_attr_mask)
 {
-	struct bnxt_re_qp *qp = rdev->qp1_sqp;
+	struct bnxt_re_qp *qp = rdev->gsi_ctx.gsi_sqp;
 	int rc = 0;
 
 	if (qp_attr_mask & IB_QP_STATE) {
@@ -1528,8 +1815,7 @@
 
 	rc = bnxt_qplib_modify_qp(&rdev->qplib_res, &qp->qplib_qp);
 	if (rc)
-		dev_err(rdev_to_dev(rdev),
-			"Failed to modify Shadow QP for QP1");
+		ibdev_err(&rdev->ibdev, "Failed to modify Shadow QP for QP1");
 	return rc;
 }
 
@@ -1550,15 +1836,15 @@
 		new_qp_state = qp_attr->qp_state;
 		if (!ib_modify_qp_is_ok(curr_qp_state, new_qp_state,
 					ib_qp->qp_type, qp_attr_mask)) {
-			dev_err(rdev_to_dev(rdev),
-				"Invalid attribute mask: %#x specified ",
-				qp_attr_mask);
-			dev_err(rdev_to_dev(rdev),
-				"for qpn: %#x type: %#x",
-				ib_qp->qp_num, ib_qp->qp_type);
-			dev_err(rdev_to_dev(rdev),
-				"curr_qp_state=0x%x, new_qp_state=0x%x\n",
-				curr_qp_state, new_qp_state);
+			ibdev_err(&rdev->ibdev,
+				  "Invalid attribute mask: %#x specified ",
+				  qp_attr_mask);
+			ibdev_err(&rdev->ibdev,
+				  "for qpn: %#x type: %#x",
+				  ib_qp->qp_num, ib_qp->qp_type);
+			ibdev_err(&rdev->ibdev,
+				  "curr_qp_state=0x%x, new_qp_state=0x%x\n",
+				  curr_qp_state, new_qp_state);
 			return -EINVAL;
 		}
 		qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_STATE;
@@ -1566,18 +1852,16 @@
 
 		if (!qp->sumem &&
 		    qp->qplib_qp.state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
-			dev_dbg(rdev_to_dev(rdev),
-				"Move QP = %p to flush list\n",
-				qp);
+			ibdev_dbg(&rdev->ibdev,
+				  "Move QP = %p to flush list\n", qp);
 			flags = bnxt_re_lock_cqs(qp);
 			bnxt_qplib_add_flush_qp(&qp->qplib_qp);
 			bnxt_re_unlock_cqs(qp, flags);
 		}
 		if (!qp->sumem &&
 		    qp->qplib_qp.state == CMDQ_MODIFY_QP_NEW_STATE_RESET) {
-			dev_dbg(rdev_to_dev(rdev),
-				"Move QP = %p out of flush list\n",
-				qp);
+			ibdev_dbg(&rdev->ibdev,
+				  "Move QP = %p out of flush list\n", qp);
 			flags = bnxt_re_lock_cqs(qp);
 			bnxt_qplib_clean_qp(&qp->qplib_qp);
 			bnxt_re_unlock_cqs(qp, flags);
@@ -1610,6 +1894,7 @@
 		const struct ib_global_route *grh =
 			rdma_ah_read_grh(&qp_attr->ah_attr);
 		const struct ib_gid_attr *sgid_attr;
+		struct bnxt_re_gid_ctx *ctx;
 
 		qp->qplib_qp.modify_flags |= CMDQ_MODIFY_QP_MODIFY_MASK_DGID |
 				     CMDQ_MODIFY_QP_MODIFY_MASK_FLOW_LABEL |
@@ -1621,11 +1906,12 @@
 		memcpy(qp->qplib_qp.ah.dgid.data, grh->dgid.raw,
 		       sizeof(qp->qplib_qp.ah.dgid.data));
 		qp->qplib_qp.ah.flow_label = grh->flow_label;
-		/* If RoCE V2 is enabled, stack will have two entries for
-		 * each GID entry. Avoiding this duplicte entry in HW. Dividing
-		 * the GID index by 2 for RoCE V2
+		sgid_attr = grh->sgid_attr;
+		/* Get the HW context of the GID. The reference
+		 * of GID table entry is already taken by the caller.
 		 */
-		qp->qplib_qp.ah.sgid_index = grh->sgid_index / 2;
+		ctx = rdma_read_gid_hw_context(sgid_attr);
+		qp->qplib_qp.ah.sgid_index = ctx->idx;
 		qp->qplib_qp.ah.host_sgid_index = grh->sgid_index;
 		qp->qplib_qp.ah.hop_limit = grh->hop_limit;
 		qp->qplib_qp.ah.traffic_class = grh->traffic_class;
@@ -1633,7 +1919,6 @@
 		ether_addr_copy(qp->qplib_qp.ah.dmac,
 				qp_attr->ah_attr.roce.dmac);
 
-		sgid_attr = qp_attr->ah_attr.grh.sgid_attr;
 		rc = rdma_read_gid_l2_fields(sgid_attr, NULL,
 					     &qp->qplib_qp.smac[0]);
 		if (rc)
@@ -1707,10 +1992,10 @@
 	if (qp_attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
 		if (qp_attr->max_dest_rd_atomic >
 		    dev_attr->max_qp_init_rd_atom) {
-			dev_err(rdev_to_dev(rdev),
-				"max_dest_rd_atomic requested%d is > dev_max%d",
-				qp_attr->max_dest_rd_atomic,
-				dev_attr->max_qp_init_rd_atom);
+			ibdev_err(&rdev->ibdev,
+				  "max_dest_rd_atomic requested%d is > dev_max%d",
+				  qp_attr->max_dest_rd_atomic,
+				  dev_attr->max_qp_init_rd_atom);
 			return -EINVAL;
 		}
 
@@ -1731,8 +2016,8 @@
 		    (qp_attr->cap.max_recv_sge >= dev_attr->max_qp_sges) ||
 		    (qp_attr->cap.max_inline_data >=
 						dev_attr->max_inline_data)) {
-			dev_err(rdev_to_dev(rdev),
-				"Create QP failed - max exceeded");
+			ibdev_err(&rdev->ibdev,
+				  "Create QP failed - max exceeded");
 			return -EINVAL;
 		}
 		entries = roundup_pow_of_two(qp_attr->cap.max_send_wr);
@@ -1765,10 +2050,10 @@
 	}
 	rc = bnxt_qplib_modify_qp(&rdev->qplib_res, &qp->qplib_qp);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to modify HW QP");
+		ibdev_err(&rdev->ibdev, "Failed to modify HW QP");
 		return rc;
 	}
-	if (ib_qp->qp_type == IB_QPT_GSI && rdev->qp1_sqp)
+	if (ib_qp->qp_type == IB_QPT_GSI && rdev->gsi_ctx.gsi_sqp)
 		rc = bnxt_re_modify_shadow_qp(rdev, qp, qp_attr_mask);
 	return rc;
 }
@@ -1790,7 +2075,7 @@
 
 	rc = bnxt_qplib_query_qp(&rdev->qplib_res, qplib_qp);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to query HW QP");
+		ibdev_err(&rdev->ibdev, "Failed to query HW QP");
 		goto out;
 	}
 	qp_attr->qp_state = __to_ib_qp_state(qplib_qp->state);
@@ -1996,7 +2281,7 @@
 		wqe->num_sge++;
 
 	} else {
-		dev_err(rdev_to_dev(qp->rdev), "QP1 buffer is empty!");
+		ibdev_err(&qp->rdev->ibdev, "QP1 buffer is empty!");
 		rc = -ENOMEM;
 	}
 	return rc;
@@ -2013,9 +2298,12 @@
 					    struct bnxt_qplib_swqe *wqe,
 					    int payload_size)
 {
-	struct bnxt_qplib_sge ref, sge;
-	u32 rq_prod_index;
 	struct bnxt_re_sqp_entries *sqp_entry;
+	struct bnxt_qplib_sge ref, sge;
+	struct bnxt_re_dev *rdev;
+	u32 rq_prod_index;
+
+	rdev = qp->rdev;
 
 	rq_prod_index = bnxt_qplib_get_rq_prod_index(&qp->qplib_qp);
 
@@ -2030,7 +2318,7 @@
 	ref.lkey = wqe->sg_list[0].lkey;
 	ref.size = wqe->sg_list[0].size;
 
-	sqp_entry = &qp->rdev->sqp_tbl[rq_prod_index];
+	sqp_entry = &rdev->gsi_ctx.sqp_tbl[rq_prod_index];
 
 	/* SGE 1 */
 	wqe->sg_list[0].addr = sge.addr;
@@ -2182,7 +2470,7 @@
 	wqe->frmr.pbl_dma_ptr = qplib_frpl->hwq.pbl_dma_ptr[0];
 	wqe->frmr.page_list = mr->pages;
 	wqe->frmr.page_list_len = mr->npages;
-	wqe->frmr.levels = qplib_frpl->hwq.level + 1;
+	wqe->frmr.levels = qplib_frpl->hwq.level;
 	wqe->type = BNXT_QPLIB_SWQE_TYPE_REG_MR;
 
 	/* Need unconditional fence for reg_mr
@@ -2229,8 +2517,8 @@
 
 		if ((sge_len + wqe->inline_len) >
 		    BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH) {
-			dev_err(rdev_to_dev(rdev),
-				"Inline data size requested > supported value");
+			ibdev_err(&rdev->ibdev,
+				  "Inline data size requested > supported value");
 			return -EINVAL;
 		}
 		sge_len = wr->sg_list[i].length;
@@ -2277,21 +2565,18 @@
 				       struct bnxt_re_qp *qp,
 				       const struct ib_send_wr *wr)
 {
-	struct bnxt_qplib_swqe wqe;
 	int rc = 0, payload_sz = 0;
 	unsigned long flags;
 
 	spin_lock_irqsave(&qp->sq_lock, flags);
-	memset(&wqe, 0, sizeof(wqe));
 	while (wr) {
-		/* House keeping */
-		memset(&wqe, 0, sizeof(wqe));
+		struct bnxt_qplib_swqe wqe = {};
 
 		/* Common */
 		wqe.num_sge = wr->num_sge;
 		if (wr->num_sge > qp->qplib_qp.sq.max_sge) {
-			dev_err(rdev_to_dev(rdev),
-				"Limit exceeded for Send SGEs");
+			ibdev_err(&rdev->ibdev,
+				  "Limit exceeded for Send SGEs");
 			rc = -EINVAL;
 			goto bad;
 		}
@@ -2310,9 +2595,9 @@
 			rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe);
 bad:
 		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Post send failed opcode = %#x rc = %d",
-				wr->opcode, rc);
+			ibdev_err(&rdev->ibdev,
+				  "Post send failed opcode = %#x rc = %d",
+				  wr->opcode, rc);
 			break;
 		}
 		wr = wr->next;
@@ -2339,8 +2624,8 @@
 		/* Common */
 		wqe.num_sge = wr->num_sge;
 		if (wr->num_sge > qp->qplib_qp.sq.max_sge) {
-			dev_err(rdev_to_dev(qp->rdev),
-				"Limit exceeded for Send SGEs");
+			ibdev_err(&qp->rdev->ibdev,
+				  "Limit exceeded for Send SGEs");
 			rc = -EINVAL;
 			goto bad;
 		}
@@ -2371,7 +2656,7 @@
 			default:
 				break;
 			}
-			/* fall through */
+			fallthrough;
 		case IB_WR_SEND_WITH_INV:
 			rc = bnxt_re_build_send_wqe(qp, wr, &wqe);
 			break;
@@ -2385,8 +2670,8 @@
 			rc = bnxt_re_build_atomic_wqe(wr, &wqe);
 			break;
 		case IB_WR_RDMA_READ_WITH_INV:
-			dev_err(rdev_to_dev(qp->rdev),
-				"RDMA Read with Invalidate is not supported");
+			ibdev_err(&qp->rdev->ibdev,
+				  "RDMA Read with Invalidate is not supported");
 			rc = -EINVAL;
 			goto bad;
 		case IB_WR_LOCAL_INV:
@@ -2397,8 +2682,8 @@
 			break;
 		default:
 			/* Unsupported WRs */
-			dev_err(rdev_to_dev(qp->rdev),
-				"WR (%#x) is not supported", wr->opcode);
+			ibdev_err(&qp->rdev->ibdev,
+				  "WR (%#x) is not supported", wr->opcode);
 			rc = -EINVAL;
 			goto bad;
 		}
@@ -2406,9 +2691,9 @@
 			rc = bnxt_qplib_post_send(&qp->qplib_qp, &wqe);
 bad:
 		if (rc) {
-			dev_err(rdev_to_dev(qp->rdev),
-				"post_send failed op:%#x qps = %#x rc = %d\n",
-				wr->opcode, qp->qplib_qp.state, rc);
+			ibdev_err(&qp->rdev->ibdev,
+				  "post_send failed op:%#x qps = %#x rc = %d\n",
+				  wr->opcode, qp->qplib_qp.state, rc);
 			*bad_wr = wr;
 			break;
 		}
@@ -2436,8 +2721,8 @@
 		/* Common */
 		wqe.num_sge = wr->num_sge;
 		if (wr->num_sge > qp->qplib_qp.rq.max_sge) {
-			dev_err(rdev_to_dev(rdev),
-				"Limit exceeded for Receive SGEs");
+			ibdev_err(&rdev->ibdev,
+				  "Limit exceeded for Receive SGEs");
 			rc = -EINVAL;
 			break;
 		}
@@ -2473,8 +2758,8 @@
 		/* Common */
 		wqe.num_sge = wr->num_sge;
 		if (wr->num_sge > qp->qplib_qp.rq.max_sge) {
-			dev_err(rdev_to_dev(qp->rdev),
-				"Limit exceeded for Receive SGEs");
+			ibdev_err(&qp->rdev->ibdev,
+				  "Limit exceeded for Receive SGEs");
 			rc = -EINVAL;
 			*bad_wr = wr;
 			break;
@@ -2514,7 +2799,7 @@
 }
 
 /* Completion Queues */
-void bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
 	struct bnxt_re_cq *cq;
 	struct bnxt_qplib_nq *nq;
@@ -2530,6 +2815,7 @@
 	atomic_dec(&rdev->cq_count);
 	nq->budget--;
 	kfree(cq->cql);
+	return 0;
 }
 
 int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
@@ -2545,7 +2831,7 @@
 
 	/* Validate CQ fields */
 	if (cqe < 1 || cqe > dev_attr->max_cq_wqes) {
-		dev_err(rdev_to_dev(rdev), "Failed to create CQ -max exceeded");
+		ibdev_err(&rdev->ibdev, "Failed to create CQ -max exceeded");
 		return -EINVAL;
 	}
 
@@ -2556,6 +2842,8 @@
 	if (entries > dev_attr->max_cq_wqes + 1)
 		entries = dev_attr->max_cq_wqes + 1;
 
+	cq->qplib_cq.sg_info.pgsize = PAGE_SIZE;
+	cq->qplib_cq.sg_info.pgshft = PAGE_SHIFT;
 	if (udata) {
 		struct bnxt_re_cq_req req;
 		struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context(
@@ -2565,16 +2853,14 @@
 			goto fail;
 		}
 
-		cq->umem = ib_umem_get(udata, req.cq_va,
+		cq->umem = ib_umem_get(&rdev->ibdev, req.cq_va,
 				       entries * sizeof(struct cq_base),
-				       IB_ACCESS_LOCAL_WRITE, 1);
+				       IB_ACCESS_LOCAL_WRITE);
 		if (IS_ERR(cq->umem)) {
 			rc = PTR_ERR(cq->umem);
 			goto fail;
 		}
-		cq->qplib_cq.sg_info.sglist = cq->umem->sg_head.sgl;
-		cq->qplib_cq.sg_info.npages = ib_umem_num_pages(cq->umem);
-		cq->qplib_cq.sg_info.nmap = cq->umem->nmap;
+		cq->qplib_cq.sg_info.umem = cq->umem;
 		cq->qplib_cq.dpi = &uctx->dpi;
 	} else {
 		cq->max_cql = min_t(u32, entries, MAX_CQL_PER_POLL);
@@ -2599,7 +2885,7 @@
 
 	rc = bnxt_qplib_create_cq(&rdev->qplib_res, &cq->qplib_cq);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to create HW CQ");
+		ibdev_err(&rdev->ibdev, "Failed to create HW CQ");
 		goto fail;
 	}
 
@@ -2619,7 +2905,7 @@
 		resp.rsvd = 0;
 		rc = ib_copy_to_udata(udata, &resp, sizeof(resp));
 		if (rc) {
-			dev_err(rdev_to_dev(rdev), "Failed to copy CQ udata");
+			ibdev_err(&rdev->ibdev, "Failed to copy CQ udata");
 			bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq);
 			goto c2fail;
 		}
@@ -2850,12 +3136,13 @@
 	return rc;
 }
 
-static int bnxt_re_process_raw_qp_pkt_rx(struct bnxt_re_qp *qp1_qp,
+static int bnxt_re_process_raw_qp_pkt_rx(struct bnxt_re_qp *gsi_qp,
 					 struct bnxt_qplib_cqe *cqe)
 {
-	struct bnxt_re_dev *rdev = qp1_qp->rdev;
+	struct bnxt_re_dev *rdev = gsi_qp->rdev;
 	struct bnxt_re_sqp_entries *sqp_entry = NULL;
-	struct bnxt_re_qp *qp = rdev->qp1_sqp;
+	struct bnxt_re_qp *gsi_sqp = rdev->gsi_ctx.gsi_sqp;
+	struct bnxt_re_ah *gsi_sah;
 	struct ib_send_wr *swr;
 	struct ib_ud_wr udwr;
 	struct ib_recv_wr rwr;
@@ -2878,26 +3165,26 @@
 	swr = &udwr.wr;
 	tbl_idx = cqe->wr_id;
 
-	rq_hdr_buf = qp1_qp->qplib_qp.rq_hdr_buf +
-			(tbl_idx * qp1_qp->qplib_qp.rq_hdr_buf_size);
-	rq_hdr_buf_map = bnxt_qplib_get_qp_buf_from_index(&qp1_qp->qplib_qp,
+	rq_hdr_buf = gsi_qp->qplib_qp.rq_hdr_buf +
+			(tbl_idx * gsi_qp->qplib_qp.rq_hdr_buf_size);
+	rq_hdr_buf_map = bnxt_qplib_get_qp_buf_from_index(&gsi_qp->qplib_qp,
 							  tbl_idx);
 
 	/* Shadow QP header buffer */
-	shrq_hdr_buf_map = bnxt_qplib_get_qp_buf_from_index(&qp->qplib_qp,
+	shrq_hdr_buf_map = bnxt_qplib_get_qp_buf_from_index(&gsi_qp->qplib_qp,
 							    tbl_idx);
-	sqp_entry = &rdev->sqp_tbl[tbl_idx];
+	sqp_entry = &rdev->gsi_ctx.sqp_tbl[tbl_idx];
 
 	/* Store this cqe */
 	memcpy(&sqp_entry->cqe, cqe, sizeof(struct bnxt_qplib_cqe));
-	sqp_entry->qp1_qp = qp1_qp;
+	sqp_entry->qp1_qp = gsi_qp;
 
 	/* Find packet type from the cqe */
 
 	pkt_type = bnxt_re_check_packet_type(cqe->raweth_qp1_flags,
 					     cqe->raweth_qp1_flags2);
 	if (pkt_type < 0) {
-		dev_err(rdev_to_dev(rdev), "Invalid packet\n");
+		ibdev_err(&rdev->ibdev, "Invalid packet\n");
 		return -EINVAL;
 	}
 
@@ -2944,10 +3231,10 @@
 	rwr.wr_id = tbl_idx;
 	rwr.next = NULL;
 
-	rc = bnxt_re_post_recv_shadow_qp(rdev, qp, &rwr);
+	rc = bnxt_re_post_recv_shadow_qp(rdev, gsi_sqp, &rwr);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev),
-			"Failed to post Rx buffers to shadow QP");
+		ibdev_err(&rdev->ibdev,
+			  "Failed to post Rx buffers to shadow QP");
 		return -ENOMEM;
 	}
 
@@ -2956,13 +3243,13 @@
 	swr->wr_id = tbl_idx;
 	swr->opcode = IB_WR_SEND;
 	swr->next = NULL;
-
-	udwr.ah = &rdev->sqp_ah->ib_ah;
-	udwr.remote_qpn = rdev->qp1_sqp->qplib_qp.id;
-	udwr.remote_qkey = rdev->qp1_sqp->qplib_qp.qkey;
+	gsi_sah = rdev->gsi_ctx.gsi_sah;
+	udwr.ah = &gsi_sah->ib_ah;
+	udwr.remote_qpn = gsi_sqp->qplib_qp.id;
+	udwr.remote_qkey = gsi_sqp->qplib_qp.qkey;
 
 	/* post data received  in the send queue */
-	rc = bnxt_re_post_send_shadow_qp(rdev, qp, swr);
+	rc = bnxt_re_post_send_shadow_qp(rdev, gsi_sqp, swr);
 
 	return 0;
 }
@@ -3029,12 +3316,12 @@
 		wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
 }
 
-static void bnxt_re_process_res_shadow_qp_wc(struct bnxt_re_qp *qp,
+static void bnxt_re_process_res_shadow_qp_wc(struct bnxt_re_qp *gsi_sqp,
 					     struct ib_wc *wc,
 					     struct bnxt_qplib_cqe *cqe)
 {
-	struct bnxt_re_dev *rdev = qp->rdev;
-	struct bnxt_re_qp *qp1_qp = NULL;
+	struct bnxt_re_dev *rdev = gsi_sqp->rdev;
+	struct bnxt_re_qp *gsi_qp = NULL;
 	struct bnxt_qplib_cqe *orig_cqe = NULL;
 	struct bnxt_re_sqp_entries *sqp_entry = NULL;
 	int nw_type;
@@ -3044,13 +3331,13 @@
 
 	tbl_idx = cqe->wr_id;
 
-	sqp_entry = &rdev->sqp_tbl[tbl_idx];
-	qp1_qp = sqp_entry->qp1_qp;
+	sqp_entry = &rdev->gsi_ctx.sqp_tbl[tbl_idx];
+	gsi_qp = sqp_entry->qp1_qp;
 	orig_cqe = &sqp_entry->cqe;
 
 	wc->wr_id = sqp_entry->wrid;
 	wc->byte_len = orig_cqe->length;
-	wc->qp = &qp1_qp->ib_qp;
+	wc->qp = &gsi_qp->ib_qp;
 
 	wc->ex.imm_data = orig_cqe->immdata;
 	wc->src_qp = orig_cqe->src_qp;
@@ -3081,8 +3368,11 @@
 				      struct ib_wc *wc,
 				      struct bnxt_qplib_cqe *cqe)
 {
+	struct bnxt_re_dev *rdev;
+	u16 vlan_id = 0;
 	u8 nw_type;
 
+	rdev = qp->rdev;
 	wc->opcode = IB_WC_RECV;
 	wc->status = __rc_to_ib_wc_status(cqe->status);
 
@@ -3094,9 +3384,12 @@
 		memcpy(wc->smac, cqe->smac, ETH_ALEN);
 		wc->wc_flags |= IB_WC_WITH_SMAC;
 		if (cqe->flags & CQ_RES_UD_FLAGS_META_FORMAT_VLAN) {
-			wc->vlan_id = (cqe->cfa_meta & 0xFFF);
-			if (wc->vlan_id < 0x1000)
-				wc->wc_flags |= IB_WC_WITH_VLAN;
+			vlan_id = (cqe->cfa_meta & 0xFFF);
+		}
+		/* Mark only if vlan_id is non zero */
+		if (vlan_id && bnxt_re_check_if_vlan_valid(rdev, vlan_id)) {
+			wc->vlan_id = vlan_id;
+			wc->wc_flags |= IB_WC_WITH_VLAN;
 		}
 		nw_type = (cqe->flags & CQ_RES_UD_FLAGS_ROCE_IP_VER_MASK) >>
 			   CQ_RES_UD_FLAGS_ROCE_IP_VER_SFT;
@@ -3117,11 +3410,11 @@
 	rc = bnxt_re_bind_fence_mw(lib_qp);
 	if (!rc) {
 		lib_qp->sq.phantom_wqe_cnt++;
-		dev_dbg(&lib_qp->sq.hwq.pdev->dev,
-			"qp %#x sq->prod %#x sw_prod %#x phantom_wqe_cnt %d\n",
-			lib_qp->id, lib_qp->sq.hwq.prod,
-			HWQ_CMP(lib_qp->sq.hwq.prod, &lib_qp->sq.hwq),
-			lib_qp->sq.phantom_wqe_cnt);
+		ibdev_dbg(&qp->rdev->ibdev,
+			  "qp %#x sq->prod %#x sw_prod %#x phantom_wqe_cnt %d\n",
+			  lib_qp->id, lib_qp->sq.hwq.prod,
+			  HWQ_CMP(lib_qp->sq.hwq.prod, &lib_qp->sq.hwq),
+			  lib_qp->sq.phantom_wqe_cnt);
 	}
 
 	spin_unlock_irqrestore(&qp->sq_lock, flags);
@@ -3131,7 +3424,7 @@
 int bnxt_re_poll_cq(struct ib_cq *ib_cq, int num_entries, struct ib_wc *wc)
 {
 	struct bnxt_re_cq *cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq);
-	struct bnxt_re_qp *qp;
+	struct bnxt_re_qp *qp, *sh_qp;
 	struct bnxt_qplib_cqe *cqe;
 	int i, ncqe, budget;
 	struct bnxt_qplib_q *sq;
@@ -3144,7 +3437,7 @@
 	budget = min_t(u32, num_entries, cq->max_cql);
 	num_entries = budget;
 	if (!cq->cql) {
-		dev_err(rdev_to_dev(cq->rdev), "POLL CQ : no CQL to use");
+		ibdev_err(&cq->rdev->ibdev, "POLL CQ : no CQL to use");
 		goto exit;
 	}
 	cqe = &cq->cql[0];
@@ -3157,8 +3450,8 @@
 				qp = container_of(lib_qp,
 						  struct bnxt_re_qp, qplib_qp);
 				if (send_phantom_wqe(qp) == -ENOMEM)
-					dev_err(rdev_to_dev(cq->rdev),
-						"Phantom failed! Scheduled to send again\n");
+					ibdev_err(&cq->rdev->ibdev,
+						  "Phantom failed! Scheduled to send again\n");
 				else
 					sq->send_phantom = false;
 			}
@@ -3182,8 +3475,7 @@
 				 (unsigned long)(cqe->qp_handle),
 				 struct bnxt_re_qp, qplib_qp);
 			if (!qp) {
-				dev_err(rdev_to_dev(cq->rdev),
-					"POLL CQ : bad QP handle");
+				ibdev_err(&cq->rdev->ibdev, "POLL CQ : bad QP handle");
 				continue;
 			}
 			wc->qp = &qp->ib_qp;
@@ -3195,8 +3487,9 @@
 
 			switch (cqe->opcode) {
 			case CQ_BASE_CQE_TYPE_REQ:
-				if (qp->rdev->qp1_sqp && qp->qplib_qp.id ==
-				    qp->rdev->qp1_sqp->qplib_qp.id) {
+				sh_qp = qp->rdev->gsi_ctx.gsi_sqp;
+				if (sh_qp &&
+				    qp->qplib_qp.id == sh_qp->qplib_qp.id) {
 					/* Handle this completion with
 					 * the stored completion
 					 */
@@ -3222,7 +3515,7 @@
 				 * stored in the table
 				 */
 				tbl_idx = cqe->wr_id;
-				sqp_entry = &cq->rdev->sqp_tbl[tbl_idx];
+				sqp_entry = &cq->rdev->gsi_ctx.sqp_tbl[tbl_idx];
 				wc->wr_id = sqp_entry->wrid;
 				bnxt_re_process_res_rawqp1_wc(wc, cqe);
 				break;
@@ -3230,8 +3523,9 @@
 				bnxt_re_process_res_rc_wc(wc, cqe);
 				break;
 			case CQ_BASE_CQE_TYPE_RES_UD:
-				if (qp->rdev->qp1_sqp && qp->qplib_qp.id ==
-				    qp->rdev->qp1_sqp->qplib_qp.id) {
+				sh_qp = qp->rdev->gsi_ctx.gsi_sqp;
+				if (sh_qp &&
+				    qp->qplib_qp.id == sh_qp->qplib_qp.id) {
 					/* Handle this completion with
 					 * the stored completion
 					 */
@@ -3246,9 +3540,9 @@
 				bnxt_re_process_res_ud_wc(qp, wc, cqe);
 				break;
 			default:
-				dev_err(rdev_to_dev(cq->rdev),
-					"POLL CQ : type 0x%x not handled",
-					cqe->opcode);
+				ibdev_err(&cq->rdev->ibdev,
+					  "POLL CQ : type 0x%x not handled",
+					  cqe->opcode);
 				continue;
 			}
 			wc++;
@@ -3341,7 +3635,7 @@
 
 	rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Dereg MR failed: %#x\n", rc);
+		ibdev_err(&rdev->ibdev, "Dereg MR failed: %#x\n", rc);
 		return rc;
 	}
 
@@ -3380,7 +3674,7 @@
 }
 
 struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type,
-			       u32 max_num_sg, struct ib_udata *udata)
+			       u32 max_num_sg)
 {
 	struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
 	struct bnxt_re_dev *rdev = pd->rdev;
@@ -3388,7 +3682,7 @@
 	int rc;
 
 	if (type != IB_MR_TYPE_MEM_REG) {
-		dev_dbg(rdev_to_dev(rdev), "MR type 0x%x not supported", type);
+		ibdev_dbg(&rdev->ibdev, "MR type 0x%x not supported", type);
 		return ERR_PTR(-EINVAL);
 	}
 	if (max_num_sg > MAX_PBL_LVL_1_PGS)
@@ -3418,8 +3712,8 @@
 	rc = bnxt_qplib_alloc_fast_reg_page_list(&rdev->qplib_res,
 						 &mr->qplib_frpl, max_num_sg);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev),
-			"Failed to allocate HW FR page list");
+		ibdev_err(&rdev->ibdev,
+			  "Failed to allocate HW FR page list");
 		goto fail_mr;
 	}
 
@@ -3454,7 +3748,7 @@
 			       CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B);
 	rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mw->qplib_mw);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Allocate MW failed!");
+		ibdev_err(&rdev->ibdev, "Allocate MW failed!");
 		goto fail;
 	}
 	mw->ib_mw.rkey = mw->qplib_mw.rkey;
@@ -3475,7 +3769,7 @@
 
 	rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mw->qplib_mw);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Free MW failed: %#x\n", rc);
+		ibdev_err(&rdev->ibdev, "Free MW failed: %#x\n", rc);
 		return rc;
 	}
 
@@ -3484,23 +3778,6 @@
 	return rc;
 }
 
-static int bnxt_re_page_size_ok(int page_shift)
-{
-	switch (page_shift) {
-	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4K:
-	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_8K:
-	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_64K:
-	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_2M:
-	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_256K:
-	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1M:
-	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4M:
-	case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1G:
-		return 1;
-	default:
-		return 0;
-	}
-}
-
 static int fill_umem_pbl_tbl(struct ib_umem *umem, u64 *pbl_tbl_orig,
 			     int page_shift)
 {
@@ -3508,7 +3785,7 @@
 	u64 page_size =  BIT_ULL(page_shift);
 	struct ib_block_iter biter;
 
-	rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap, page_size)
+	rdma_umem_for_each_dma_block(umem, &biter, page_size)
 		*pbl_tbl++ = rdma_block_iter_dma_address(&biter);
 
 	return pbl_tbl - pbl_tbl_orig;
@@ -3524,11 +3801,12 @@
 	struct bnxt_re_mr *mr;
 	struct ib_umem *umem;
 	u64 *pbl_tbl = NULL;
-	int umem_pgs, page_shift, rc;
+	unsigned long page_size;
+	int umem_pgs, rc;
 
 	if (length > BNXT_RE_MAX_MR_SIZE) {
-		dev_err(rdev_to_dev(rdev), "MR Size: %lld > Max supported:%lld\n",
-			length, BNXT_RE_MAX_MR_SIZE);
+		ibdev_err(&rdev->ibdev, "MR Size: %lld > Max supported:%lld\n",
+			  length, BNXT_RE_MAX_MR_SIZE);
 		return ERR_PTR(-ENOMEM);
 	}
 
@@ -3543,59 +3821,51 @@
 
 	rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to allocate MR");
+		ibdev_err(&rdev->ibdev, "Failed to allocate MR");
 		goto free_mr;
 	}
 	/* The fixed portion of the rkey is the same as the lkey */
 	mr->ib_mr.rkey = mr->qplib_mr.rkey;
 
-	umem = ib_umem_get(udata, start, length, mr_access_flags, 0);
+	umem = ib_umem_get(&rdev->ibdev, start, length, mr_access_flags);
 	if (IS_ERR(umem)) {
-		dev_err(rdev_to_dev(rdev), "Failed to get umem");
+		ibdev_err(&rdev->ibdev, "Failed to get umem");
 		rc = -EFAULT;
 		goto free_mrw;
 	}
 	mr->ib_umem = umem;
 
 	mr->qplib_mr.va = virt_addr;
-	umem_pgs = ib_umem_page_count(umem);
-	if (!umem_pgs) {
-		dev_err(rdev_to_dev(rdev), "umem is invalid!");
-		rc = -EINVAL;
+	page_size = ib_umem_find_best_pgsz(
+		umem, BNXT_RE_PAGE_SIZE_4K | BNXT_RE_PAGE_SIZE_2M, virt_addr);
+	if (!page_size) {
+		ibdev_err(&rdev->ibdev, "umem page size unsupported!");
+		rc = -EFAULT;
 		goto free_umem;
 	}
 	mr->qplib_mr.total_size = length;
 
-	pbl_tbl = kcalloc(umem_pgs, sizeof(u64 *), GFP_KERNEL);
+	if (page_size == BNXT_RE_PAGE_SIZE_4K &&
+	    length > BNXT_RE_MAX_MR_SIZE_LOW) {
+		ibdev_err(&rdev->ibdev, "Requested MR Sz:%llu Max sup:%llu",
+			  length, (u64)BNXT_RE_MAX_MR_SIZE_LOW);
+		rc = -EINVAL;
+		goto free_umem;
+	}
+
+	umem_pgs = ib_umem_num_dma_blocks(umem, page_size);
+	pbl_tbl = kcalloc(umem_pgs, sizeof(*pbl_tbl), GFP_KERNEL);
 	if (!pbl_tbl) {
 		rc = -ENOMEM;
 		goto free_umem;
 	}
 
-	page_shift = __ffs(ib_umem_find_best_pgsz(umem,
-				BNXT_RE_PAGE_SIZE_4K | BNXT_RE_PAGE_SIZE_2M,
-				virt_addr));
-
-	if (!bnxt_re_page_size_ok(page_shift)) {
-		dev_err(rdev_to_dev(rdev), "umem page size unsupported!");
-		rc = -EFAULT;
-		goto fail;
-	}
-
-	if (page_shift == BNXT_RE_PAGE_SHIFT_4K &&
-	    length > BNXT_RE_MAX_MR_SIZE_LOW) {
-		dev_err(rdev_to_dev(rdev), "Requested MR Sz:%llu Max sup:%llu",
-			length,	(u64)BNXT_RE_MAX_MR_SIZE_LOW);
-		rc = -EINVAL;
-		goto fail;
-	}
-
 	/* Map umem buf ptrs to the PBL */
-	umem_pgs = fill_umem_pbl_tbl(umem, pbl_tbl, page_shift);
+	umem_pgs = fill_umem_pbl_tbl(umem, pbl_tbl, order_base_2(page_size));
 	rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, pbl_tbl,
-			       umem_pgs, false, 1 << page_shift);
+			       umem_pgs, false, page_size);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to register user MR");
+		ibdev_err(&rdev->ibdev, "Failed to register user MR");
 		goto fail;
 	}
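
For reference, a minimal sketch of the core-API flow that the reg_user_mr hunk above now follows, shown in isolation. The helper name build_pbl(), its arguments and the 4K|2M support mask are illustrative assumptions rather than bnxt_re code; the numbers in the comment are only an example.

#include <linux/sizes.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>

/* Hypothetical helper: pin a user buffer and walk it in the largest
 * HW-supported page size. For a 4 MiB buffer whose start and iova are
 * 2 MiB aligned, ib_umem_find_best_pgsz() returns SZ_2M and
 * ib_umem_num_dma_blocks() reports 2 blocks; otherwise it falls back to
 * SZ_4K and 1024 blocks.
 */
static struct ib_umem *build_pbl(struct ib_device *ibdev, u64 start, u64 len,
                                 u64 iova, int access, u64 *pbl,
                                 size_t max_pbl, unsigned long *pg_sz)
{
        struct ib_block_iter biter;
        struct ib_umem *umem;
        size_t n = 0;

        umem = ib_umem_get(ibdev, start, len, access);
        if (IS_ERR(umem))
                return umem;

        *pg_sz = ib_umem_find_best_pgsz(umem, SZ_4K | SZ_2M, iova);
        if (!*pg_sz || ib_umem_num_dma_blocks(umem, *pg_sz) > max_pbl) {
                ib_umem_release(umem);
                return ERR_PTR(-EFAULT);
        }

        /* One DMA address per pg_sz-sized block, as fill_umem_pbl_tbl() does */
        rdma_umem_for_each_dma_block(umem, &biter, *pg_sz)
                pbl[n++] = rdma_block_iter_dma_address(&biter);

        return umem;    /* caller keeps it pinned until the MR is destroyed */
}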
 
@@ -3628,12 +3898,11 @@
 	u32 chip_met_rev_num = 0;
 	int rc;
 
-	dev_dbg(rdev_to_dev(rdev), "ABI version requested %u",
-		ibdev->ops.uverbs_abi_ver);
+	ibdev_dbg(ibdev, "ABI version requested %u", ibdev->ops.uverbs_abi_ver);
 
 	if (ibdev->ops.uverbs_abi_ver != BNXT_RE_ABI_VERSION) {
-		dev_dbg(rdev_to_dev(rdev), " is different from the device %d ",
-			BNXT_RE_ABI_VERSION);
+		ibdev_dbg(ibdev, " is different from the device %d ",
+			  BNXT_RE_ABI_VERSION);
 		return -EPERM;
 	}
 
@@ -3647,10 +3916,10 @@
 	spin_lock_init(&uctx->sh_lock);
 
 	resp.comp_mask = BNXT_RE_UCNTX_CMASK_HAVE_CCTX;
-	chip_met_rev_num = rdev->chip_ctx.chip_num;
-	chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_rev & 0xFF) <<
+	chip_met_rev_num = rdev->chip_ctx->chip_num;
+	chip_met_rev_num |= ((u32)rdev->chip_ctx->chip_rev & 0xFF) <<
 			     BNXT_RE_CHIP_ID0_CHIP_REV_SFT;
-	chip_met_rev_num |= ((u32)rdev->chip_ctx.chip_metal & 0xFF) <<
+	chip_met_rev_num |= ((u32)rdev->chip_ctx->chip_metal & 0xFF) <<
 			     BNXT_RE_CHIP_ID0_CHIP_MET_SFT;
 	resp.chip_id0 = chip_met_rev_num;
 	/* Future extension of chip info */
@@ -3665,7 +3934,7 @@
 
 	rc = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to copy user context");
+		ibdev_err(ibdev, "Failed to copy user context");
 		rc = -EFAULT;
 		goto cfail;
 	}
@@ -3715,15 +3984,14 @@
 		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 		if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
 				       PAGE_SIZE, vma->vm_page_prot)) {
-			dev_err(rdev_to_dev(rdev), "Failed to map DPI");
+			ibdev_err(&rdev->ibdev, "Failed to map DPI");
 			return -EAGAIN;
 		}
 	} else {
 		pfn = virt_to_phys(uctx->shpg) >> PAGE_SHIFT;
 		if (remap_pfn_range(vma, vma->vm_start,
 				    pfn, PAGE_SIZE, vma->vm_page_prot)) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to map shared page");
+			ibdev_err(&rdev->ibdev, "Failed to map shared page");
 			return -EAGAIN;
 		}
 	}
diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
index 31662b1..9a8130b 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h
@@ -122,12 +122,6 @@
 	u64				*page_list;
 };
 
-struct bnxt_re_fmr {
-	struct bnxt_re_dev	*rdev;
-	struct ib_fmr		ib_fmr;
-	struct bnxt_qplib_mrw	qplib_fmr;
-};
-
 struct bnxt_re_mw {
 	struct bnxt_re_dev	*rdev;
 	struct ib_mw		ib_mw;
@@ -142,12 +136,19 @@
 	spinlock_t		sh_lock;	/* protect shpg */
 };
 
+static inline u16 bnxt_re_get_swqe_size(int nsge)
+{
+	return sizeof(struct sq_send_hdr) + nsge * sizeof(struct sq_sge);
+}
+
+static inline u16 bnxt_re_get_rwqe_size(int nsge)
+{
+	return sizeof(struct rq_wqe_hdr) + (nsge * sizeof(struct sq_sge));
+}
+
 int bnxt_re_query_device(struct ib_device *ibdev,
 			 struct ib_device_attr *ib_attr,
 			 struct ib_udata *udata);
-int bnxt_re_modify_device(struct ib_device *ibdev,
-			  int device_modify_mask,
-			  struct ib_device_modify *device_modify);
 int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num,
 		       struct ib_port_attr *port_attr);
 int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num,
@@ -162,12 +163,12 @@
 enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev,
 					    u8 port_num);
 int bnxt_re_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
-void bnxt_re_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
-int bnxt_re_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+int bnxt_re_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+int bnxt_re_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr,
 		      struct ib_udata *udata);
 int bnxt_re_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
 int bnxt_re_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
-void bnxt_re_destroy_ah(struct ib_ah *ah, u32 flags);
+int bnxt_re_destroy_ah(struct ib_ah *ah, u32 flags);
 int bnxt_re_create_srq(struct ib_srq *srq,
 		       struct ib_srq_init_attr *srq_init_attr,
 		       struct ib_udata *udata);
@@ -175,7 +176,7 @@
 		       enum ib_srq_attr_mask srq_attr_mask,
 		       struct ib_udata *udata);
 int bnxt_re_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
-void bnxt_re_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
+int bnxt_re_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
 int bnxt_re_post_srq_recv(struct ib_srq *srq, const struct ib_recv_wr *recv_wr,
 			  const struct ib_recv_wr **bad_recv_wr);
 struct ib_qp *bnxt_re_create_qp(struct ib_pd *pd,
@@ -192,7 +193,7 @@
 		      const struct ib_recv_wr **bad_recv_wr);
 int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		      struct ib_udata *udata);
-void bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+int bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
 int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
 struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
@@ -200,7 +201,7 @@
 int bnxt_re_map_mr_sg(struct ib_mr *ib_mr, struct scatterlist *sg, int sg_nents,
 		      unsigned int *sg_offset);
 struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type mr_type,
-			       u32 max_num_sg, struct ib_udata *udata);
+			       u32 max_num_sg);
 int bnxt_re_dereg_mr(struct ib_mr *mr, struct ib_udata *udata);
 struct ib_mw *bnxt_re_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type,
 			       struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index cfe5f47..9ef6aea 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -78,28 +78,56 @@
 /* Mutex to protect the list of bnxt_re devices added */
 static DEFINE_MUTEX(bnxt_re_dev_lock);
 static struct workqueue_struct *bnxt_re_wq;
-static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev);
+static void bnxt_re_remove_device(struct bnxt_re_dev *rdev);
+static void bnxt_re_dealloc_driver(struct ib_device *ib_dev);
+static void bnxt_re_stop_irq(void *handle);
+
+static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode)
+{
+	struct bnxt_qplib_chip_ctx *cctx;
+
+	cctx = rdev->chip_ctx;
+	cctx->modes.wqe_mode = bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) ?
+			       mode : BNXT_QPLIB_WQE_MODE_STATIC;
+}
 
 static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev)
 {
+	struct bnxt_qplib_chip_ctx *chip_ctx;
+
+	if (!rdev->chip_ctx)
+		return;
+	chip_ctx = rdev->chip_ctx;
+	rdev->chip_ctx = NULL;
 	rdev->rcfw.res = NULL;
 	rdev->qplib_res.cctx = NULL;
+	rdev->qplib_res.pdev = NULL;
+	rdev->qplib_res.netdev = NULL;
+	kfree(chip_ctx);
 }
 
-static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev)
+static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode)
 {
+	struct bnxt_qplib_chip_ctx *chip_ctx;
 	struct bnxt_en_dev *en_dev;
 	struct bnxt *bp;
 
 	en_dev = rdev->en_dev;
 	bp = netdev_priv(en_dev->net);
 
-	rdev->chip_ctx.chip_num = bp->chip_num;
+	chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL);
+	if (!chip_ctx)
+		return -ENOMEM;
+	chip_ctx->chip_num = bp->chip_num;
+	chip_ctx->hw_stats_size = bp->hw_ring_stats_size;
+
+	rdev->chip_ctx = chip_ctx;
 	/* rest members to follow eventually */
 
-	rdev->qplib_res.cctx = &rdev->chip_ctx;
+	rdev->qplib_res.cctx = rdev->chip_ctx;
 	rdev->rcfw.res = &rdev->qplib_res;
 
+	bnxt_re_set_drv_mode(rdev, wqe_mode);
 	return 0;
 }
 
@@ -119,61 +147,76 @@
  * reserved for the function. The driver may choose to allocate fewer
  * resources than the firmware maximum.
  */
+static void bnxt_re_limit_pf_res(struct bnxt_re_dev *rdev)
+{
+	struct bnxt_qplib_dev_attr *attr;
+	struct bnxt_qplib_ctx *ctx;
+	int i;
+
+	attr = &rdev->dev_attr;
+	ctx = &rdev->qplib_ctx;
+
+	ctx->qpc_count = min_t(u32, BNXT_RE_MAX_QPC_COUNT,
+			       attr->max_qp);
+	ctx->mrw_count = BNXT_RE_MAX_MRW_COUNT_256K;
+	/* Use max_mr from fw since max_mrw does not get set */
+	ctx->mrw_count = min_t(u32, ctx->mrw_count, attr->max_mr);
+	ctx->srqc_count = min_t(u32, BNXT_RE_MAX_SRQC_COUNT,
+				attr->max_srq);
+	ctx->cq_count = min_t(u32, BNXT_RE_MAX_CQ_COUNT, attr->max_cq);
+	if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx))
+		for (i = 0; i < MAX_TQM_ALLOC_REQ; i++)
+			rdev->qplib_ctx.tqm_ctx.qcount[i] =
+			rdev->dev_attr.tqm_alloc_reqs[i];
+}
+
+static void bnxt_re_limit_vf_res(struct bnxt_qplib_ctx *qplib_ctx, u32 num_vf)
+{
+	struct bnxt_qplib_vf_res *vf_res;
+	u32 mrws = 0;
+	u32 vf_pct;
+	u32 nvfs;
+
+	vf_res = &qplib_ctx->vf_res;
+	/*
+	 * Reserve a set of resources for the PF. Divide the remaining
+	 * resources among the VFs
+	 */
+	vf_pct = 100 - BNXT_RE_PCT_RSVD_FOR_PF;
+	nvfs = num_vf;
+	num_vf = 100 * num_vf;
+	vf_res->max_qp_per_vf = (qplib_ctx->qpc_count * vf_pct) / num_vf;
+	vf_res->max_srq_per_vf = (qplib_ctx->srqc_count * vf_pct) / num_vf;
+	vf_res->max_cq_per_vf = (qplib_ctx->cq_count * vf_pct) / num_vf;
+	/*
+	 * The driver allows many more MRs than other resources. If the
+	 * firmware does also, then reserve a fixed amount for the PF and
+	 * divide the rest among VFs. VFs may use many MRs for NFS
+	 * mounts, ISER, NVME applications, etc. If the firmware severely
+	 * restricts the number of MRs, then let PF have half and divide
+	 * the rest among VFs, as for the other resource types.
+	 */
+	if (qplib_ctx->mrw_count < BNXT_RE_MAX_MRW_COUNT_64K) {
+		mrws = qplib_ctx->mrw_count * vf_pct;
+		nvfs = num_vf;
+	} else {
+		mrws = qplib_ctx->mrw_count - BNXT_RE_RESVD_MR_FOR_PF;
+	}
+	vf_res->max_mrw_per_vf = (mrws / nvfs);
+	vf_res->max_gid_per_vf = BNXT_RE_MAX_GID_PER_VF;
+}
+
 static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev)
 {
-	u32 vf_qps = 0, vf_srqs = 0, vf_cqs = 0, vf_mrws = 0, vf_gids = 0;
-	u32 i;
-	u32 vf_pct;
 	u32 num_vfs;
-	struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
 
-	rdev->qplib_ctx.qpc_count = min_t(u32, BNXT_RE_MAX_QPC_COUNT,
-					  dev_attr->max_qp);
+	memset(&rdev->qplib_ctx.vf_res, 0, sizeof(struct bnxt_qplib_vf_res));
+	bnxt_re_limit_pf_res(rdev);
 
-	rdev->qplib_ctx.mrw_count = BNXT_RE_MAX_MRW_COUNT_256K;
-	/* Use max_mr from fw since max_mrw does not get set */
-	rdev->qplib_ctx.mrw_count = min_t(u32, rdev->qplib_ctx.mrw_count,
-					  dev_attr->max_mr);
-	rdev->qplib_ctx.srqc_count = min_t(u32, BNXT_RE_MAX_SRQC_COUNT,
-					   dev_attr->max_srq);
-	rdev->qplib_ctx.cq_count = min_t(u32, BNXT_RE_MAX_CQ_COUNT,
-					 dev_attr->max_cq);
-
-	for (i = 0; i < MAX_TQM_ALLOC_REQ; i++)
-		rdev->qplib_ctx.tqm_count[i] =
-		rdev->dev_attr.tqm_alloc_reqs[i];
-
-	if (rdev->num_vfs) {
-		/*
-		 * Reserve a set of resources for the PF. Divide the remaining
-		 * resources among the VFs
-		 */
-		vf_pct = 100 - BNXT_RE_PCT_RSVD_FOR_PF;
-		num_vfs = 100 * rdev->num_vfs;
-		vf_qps = (rdev->qplib_ctx.qpc_count * vf_pct) / num_vfs;
-		vf_srqs = (rdev->qplib_ctx.srqc_count * vf_pct) / num_vfs;
-		vf_cqs = (rdev->qplib_ctx.cq_count * vf_pct) / num_vfs;
-		/*
-		 * The driver allows many more MRs than other resources. If the
-		 * firmware does also, then reserve a fixed amount for the PF
-		 * and divide the rest among VFs. VFs may use many MRs for NFS
-		 * mounts, ISER, NVME applications, etc. If the firmware
-		 * severely restricts the number of MRs, then let PF have
-		 * half and divide the rest among VFs, as for the other
-		 * resource types.
-		 */
-		if (rdev->qplib_ctx.mrw_count < BNXT_RE_MAX_MRW_COUNT_64K)
-			vf_mrws = rdev->qplib_ctx.mrw_count * vf_pct / num_vfs;
-		else
-			vf_mrws = (rdev->qplib_ctx.mrw_count -
-				   BNXT_RE_RESVD_MR_FOR_PF) / rdev->num_vfs;
-		vf_gids = BNXT_RE_MAX_GID_PER_VF;
-	}
-	rdev->qplib_ctx.vf_res.max_mrw_per_vf = vf_mrws;
-	rdev->qplib_ctx.vf_res.max_gid_per_vf = vf_gids;
-	rdev->qplib_ctx.vf_res.max_qp_per_vf = vf_qps;
-	rdev->qplib_ctx.vf_res.max_srq_per_vf = vf_srqs;
-	rdev->qplib_ctx.vf_res.max_cq_per_vf = vf_cqs;
+	num_vfs =  bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) ?
+			BNXT_RE_GEN_P5_MAX_VF : rdev->num_vfs;
+	if (num_vfs)
+		bnxt_re_limit_vf_res(&rdev->qplib_ctx, num_vfs);
 }
 
 /* for handling bnxt_en callbacks later */
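
A worked example of the split performed by bnxt_re_limit_vf_res() above. The figures are illustrative, BNXT_RE_PCT_RSVD_FOR_PF is assumed to be 10 for this sketch, and the snippet is plain user-space C rather than driver code.

#include <stdio.h>

int main(void)
{
        unsigned int qpc_count = 65536; /* PF pool after bnxt_re_limit_pf_res() */
        unsigned int nvfs = 8;          /* active VFs */
        unsigned int vf_pct = 100 - 10; /* 100 - BNXT_RE_PCT_RSVD_FOR_PF (assumed 10) */
        unsigned int den = 100 * nvfs;  /* the "num_vf = 100 * num_vf" step */

        /* Integer math mirrors the driver: 65536 * 90 / 800 = 7372 QPs per VF,
         * leaving roughly 10% of the pool with the PF.
         */
        printf("max_qp_per_vf = %u\n", qpc_count * vf_pct / den);
        return 0;
}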
@@ -193,9 +236,11 @@
 		return;
 
 	rdev->num_vfs = num_vfs;
-	bnxt_re_set_resource_limits(rdev);
-	bnxt_qplib_set_func_resources(&rdev->qplib_res, &rdev->rcfw,
-				      &rdev->qplib_ctx);
+	if (!bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx)) {
+		bnxt_re_set_resource_limits(rdev);
+		bnxt_qplib_set_func_resources(&rdev->qplib_res, &rdev->rcfw,
+					      &rdev->qplib_ctx);
+	}
 }
 
 static void bnxt_re_shutdown(void *p)
@@ -204,8 +249,10 @@
 
 	if (!rdev)
 		return;
-
-	bnxt_re_ib_unreg(rdev);
+	ASSERT_RTNL();
+	/* Release the MSIx vectors before queuing unregister */
+	bnxt_re_stop_irq(rdev);
+	ib_unregister_device_queued(&rdev->ibdev);
 }
 
 static void bnxt_re_stop_irq(void *handle)
@@ -237,7 +284,7 @@
 		 * to f/w will timeout and that will set the
 		 * timeout bit.
 		 */
-		dev_err(rdev_to_dev(rdev), "Failed to re-start IRQs\n");
+		ibdev_err(&rdev->ibdev, "Failed to re-start IRQs\n");
 		return;
 	}
 
@@ -254,8 +301,8 @@
 		rc = bnxt_qplib_nq_start_irq(nq, indx - 1,
 					     msix_ent[indx].vector, false);
 		if (rc)
-			dev_warn(rdev_to_dev(rdev),
-				 "Failed to reinit NQ index %d\n", indx - 1);
+			ibdev_warn(&rdev->ibdev, "Failed to reinit NQ index %d\n",
+				   indx - 1);
 	}
 }
 
@@ -341,9 +388,9 @@
 		goto done;
 	}
 	if (num_msix_got != num_msix_want) {
-		dev_warn(rdev_to_dev(rdev),
-			 "Requested %d MSI-X vectors, got %d\n",
-			 num_msix_want, num_msix_got);
+		ibdev_warn(&rdev->ibdev,
+			   "Requested %d MSI-X vectors, got %d\n",
+			   num_msix_want, num_msix_got);
 	}
 	rdev->num_msix = num_msix_got;
 done:
@@ -390,14 +437,14 @@
 			    sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
 	rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg);
 	if (rc)
-		dev_err(rdev_to_dev(rdev),
-			"Failed to free HW ring:%d :%#x", req.ring_id, rc);
+		ibdev_err(&rdev->ibdev, "Failed to free HW ring:%d :%#x",
+			  req.ring_id, rc);
 	return rc;
 }
 
-static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev, dma_addr_t *dma_arr,
-				  int pages, int type, u32 ring_mask,
-				  u32 map_index, u16 *fw_ring_id)
+static int bnxt_re_net_ring_alloc(struct bnxt_re_dev *rdev,
+				  struct bnxt_re_ring_attr *ring_attr,
+				  u16 *fw_ring_id)
 {
 	struct bnxt_en_dev *en_dev = rdev->en_dev;
 	struct hwrm_ring_alloc_input req = {0};
@@ -411,18 +458,18 @@
 	memset(&fw_msg, 0, sizeof(fw_msg));
 	bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_RING_ALLOC, -1, -1);
 	req.enables = 0;
-	req.page_tbl_addr =  cpu_to_le64(dma_arr[0]);
-	if (pages > 1) {
+	req.page_tbl_addr =  cpu_to_le64(ring_attr->dma_arr[0]);
+	if (ring_attr->pages > 1) {
 		/* Page size is in log2 units */
 		req.page_size = BNXT_PAGE_SHIFT;
 		req.page_tbl_depth = 1;
 	}
 	req.fbo = 0;
 	/* Association of ring index with doorbell index and MSIX number */
-	req.logical_id = cpu_to_le16(map_index);
-	req.length = cpu_to_le32(ring_mask + 1);
-	req.ring_type = type;
-	req.int_mode = RING_ALLOC_REQ_INT_MODE_MSIX;
+	req.logical_id = cpu_to_le16(ring_attr->lrid);
+	req.length = cpu_to_le32(ring_attr->depth + 1);
+	req.ring_type = ring_attr->type;
+	req.int_mode = ring_attr->mode;
 	bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
 			    sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
 	rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg);
@@ -451,8 +498,8 @@
 			    sizeof(req), DFLT_HWRM_CMD_TIMEOUT);
 	rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg);
 	if (rc)
-		dev_err(rdev_to_dev(rdev),
-			"Failed to free HW stats context %#x", rc);
+		ibdev_err(&rdev->ibdev, "Failed to free HW stats context %#x",
+			  rc);
 
 	return rc;
 }
@@ -461,6 +508,7 @@
 				       dma_addr_t dma_map,
 				       u32 *fw_stats_ctx_id)
 {
+	struct bnxt_qplib_chip_ctx *chip_ctx = rdev->chip_ctx;
 	struct hwrm_stat_ctx_alloc_output resp = {0};
 	struct hwrm_stat_ctx_alloc_input req = {0};
 	struct bnxt_en_dev *en_dev = rdev->en_dev;
@@ -477,7 +525,7 @@
 	bnxt_re_init_hwrm_hdr(rdev, (void *)&req, HWRM_STAT_CTX_ALLOC, -1, -1);
 	req.update_period_ms = cpu_to_le32(1000);
 	req.stats_dma_addr = cpu_to_le64(dma_map);
-	req.stats_dma_length = cpu_to_le16(sizeof(struct ctx_hw_stats_ext));
+	req.stats_dma_length = cpu_to_le16(chip_ctx->hw_stats_size);
 	req.stat_ctx_flags = STAT_CTX_ALLOC_REQ_STAT_CTX_FLAGS_ROCE;
 	bnxt_re_fill_fw_msg(&fw_msg, (void *)&req, sizeof(req), (void *)&resp,
 			    sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
@@ -507,17 +555,12 @@
 
 static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev)
 {
-	struct bnxt_re_dev *rdev;
+	struct ib_device *ibdev =
+		ib_device_get_by_netdev(netdev, RDMA_DRIVER_BNXT_RE);
+	if (!ibdev)
+		return NULL;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(rdev, &bnxt_re_dev_list, list) {
-		if (rdev->netdev == netdev) {
-			rcu_read_unlock();
-			return rdev;
-		}
-	}
-	rcu_read_unlock();
-	return NULL;
+	return container_of(ibdev, struct bnxt_re_dev, ibdev);
 }
 
 static void bnxt_re_dev_unprobe(struct net_device *netdev,
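
A minimal sketch of the reference discipline behind the new lookup above: ib_device_get_by_netdev() hands back the ib_device with a reference held, so each successful bnxt_re_from_netdev() call must be balanced by ib_device_put() once the caller is done, as the netdev notifier does at its exit label. The caller below is hypothetical, shown only for illustration.

static void example_lookup(struct net_device *netdev)
{
        struct bnxt_re_dev *rdev = bnxt_re_from_netdev(netdev);

        if (!rdev)
                return;

        /* ... use rdev while the reference pins the ib_device ... */

        ib_device_put(&rdev->ibdev);    /* balances ib_device_get_by_netdev() */
}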
@@ -591,11 +634,6 @@
 	.attrs = bnxt_re_attributes,
 };
 
-static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev)
-{
-	ib_unregister_device(&rdev->ibdev);
-}
-
 static const struct ib_device_ops bnxt_re_dev_ops = {
 	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_BNXT_RE,
@@ -610,6 +648,7 @@
 	.create_cq = bnxt_re_create_cq,
 	.create_qp = bnxt_re_create_qp,
 	.create_srq = bnxt_re_create_srq,
+	.dealloc_driver = bnxt_re_dealloc_driver,
 	.dealloc_pd = bnxt_re_dealloc_pd,
 	.dealloc_ucontext = bnxt_re_dealloc_ucontext,
 	.del_gid = bnxt_re_del_gid,
@@ -626,7 +665,6 @@
 	.map_mr_sg = bnxt_re_map_mr_sg,
 	.mmap = bnxt_re_mmap,
 	.modify_ah = bnxt_re_modify_ah,
-	.modify_device = bnxt_re_modify_device,
 	.modify_qp = bnxt_re_modify_qp,
 	.modify_srq = bnxt_re_modify_srq,
 	.poll_cq = bnxt_re_poll_cq,
@@ -661,7 +699,7 @@
 
 	bnxt_qplib_get_guid(rdev->netdev->dev_addr, (u8 *)&ibdev->node_guid);
 
-	ibdev->num_comp_vectors	= 1;
+	ibdev->num_comp_vectors	= rdev->num_msix - 1;
 	ibdev->dev.parent = &rdev->en_dev->pdev->dev;
 	ibdev->local_dma_lkey = BNXT_QPLIB_RSVD_LKEY;
 
@@ -700,22 +738,19 @@
 	if (ret)
 		return ret;
 
-	return ib_register_device(ibdev, "bnxt_re%d");
+	dma_set_max_seg_size(&rdev->en_dev->pdev->dev, UINT_MAX);
+	return ib_register_device(ibdev, "bnxt_re%d", &rdev->en_dev->pdev->dev);
 }
 
 static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev)
 {
 	dev_put(rdev->netdev);
 	rdev->netdev = NULL;
-
 	mutex_lock(&bnxt_re_dev_lock);
 	list_del_rcu(&rdev->list);
 	mutex_unlock(&bnxt_re_dev_lock);
 
 	synchronize_rcu();
-
-	ib_dealloc_device(&rdev->ibdev);
-	/* rdev is gone */
 }
 
 static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev,
@@ -726,8 +761,8 @@
 	/* Allocate bnxt_re_dev instance here */
 	rdev = ib_alloc_device(bnxt_re_dev, ibdev);
 	if (!rdev) {
-		dev_err(NULL, "%s: bnxt_re_dev allocation failure!",
-			ROCE_DRV_MODULE_NAME);
+		ibdev_err(NULL, "%s: bnxt_re_dev allocation failure!",
+			  ROCE_DRV_MODULE_NAME);
 		return NULL;
 	}
 	/* Default values */
@@ -857,8 +892,8 @@
 	int rc = 0;
 
 	if (!srq) {
-		dev_err(NULL, "%s: SRQ is NULL, SRQN not handled",
-			ROCE_DRV_MODULE_NAME);
+		ibdev_err(NULL, "%s: SRQ is NULL, SRQN not handled",
+			  ROCE_DRV_MODULE_NAME);
 		rc = -EINVAL;
 		goto done;
 	}
@@ -885,8 +920,8 @@
 					     qplib_cq);
 
 	if (!cq) {
-		dev_err(NULL, "%s: CQ is NULL, CQN not handled",
-			ROCE_DRV_MODULE_NAME);
+		ibdev_err(NULL, "%s: CQ is NULL, CQN not handled",
+			  ROCE_DRV_MODULE_NAME);
 		return -EINVAL;
 	}
 	if (cq->ib_cq.comp_handler) {
@@ -897,10 +932,14 @@
 	return 0;
 }
 
+#define BNXT_RE_GEN_P5_PF_NQ_DB		0x10000
+#define BNXT_RE_GEN_P5_VF_NQ_DB		0x4000
 static u32 bnxt_re_get_nqdb_offset(struct bnxt_re_dev *rdev, u16 indx)
 {
-	return bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx) ?
-				0x10000 : rdev->msix_entries[indx].db_offset;
+	return bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx) ?
+		(rdev->is_virtfn ? BNXT_RE_GEN_P5_VF_NQ_DB :
+				   BNXT_RE_GEN_P5_PF_NQ_DB) :
+				   rdev->msix_entries[indx].db_offset;
 }
 
 static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev)
@@ -929,8 +968,8 @@
 					  db_offt, &bnxt_re_cqn_handler,
 					  &bnxt_re_srqn_handler);
 		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to enable NQ with rc = 0x%x", rc);
+			ibdev_err(&rdev->ibdev,
+				  "Failed to enable NQ with rc = 0x%x", rc);
 			goto fail;
 		}
 		num_vec_enabled++;
@@ -948,10 +987,10 @@
 	int i;
 
 	for (i = 0; i < rdev->num_msix - 1; i++) {
-		type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+		type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
 		bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type);
-		rdev->nq[i].res = NULL;
 		bnxt_qplib_free_nq(&rdev->nq[i]);
+		rdev->nq[i].res = NULL;
 	}
 }
 
@@ -972,10 +1011,9 @@
 
 static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev)
 {
+	struct bnxt_re_ring_attr rattr = {};
 	int num_vec_created = 0;
-	dma_addr_t *pg_map;
 	int rc = 0, i;
-	int pages;
 	u8 type;
 
 	/* Configure and allocate resources for qplib */
@@ -997,26 +1035,28 @@
 		goto dealloc_res;
 
 	for (i = 0; i < rdev->num_msix - 1; i++) {
-		rdev->nq[i].res = &rdev->qplib_res;
-		rdev->nq[i].hwq.max_elements = BNXT_RE_MAX_CQ_COUNT +
-			BNXT_RE_MAX_SRQC_COUNT + 2;
-		rc = bnxt_qplib_alloc_nq(rdev->en_dev->pdev, &rdev->nq[i]);
+		struct bnxt_qplib_nq *nq;
+
+		nq = &rdev->nq[i];
+		nq->hwq.max_elements = BNXT_QPLIB_NQE_MAX_CNT;
+		rc = bnxt_qplib_alloc_nq(&rdev->qplib_res, &rdev->nq[i]);
 		if (rc) {
-			dev_err(rdev_to_dev(rdev), "Alloc Failed NQ%d rc:%#x",
-				i, rc);
+			ibdev_err(&rdev->ibdev, "Alloc Failed NQ%d rc:%#x",
+				  i, rc);
 			goto free_nq;
 		}
-		type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
-		pg_map = rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr;
-		pages = rdev->nq[i].hwq.pbl[rdev->nq[i].hwq.level].pg_count;
-		rc = bnxt_re_net_ring_alloc(rdev, pg_map, pages, type,
-					    BNXT_QPLIB_NQE_MAX_CNT - 1,
-					    rdev->msix_entries[i + 1].ring_idx,
-					    &rdev->nq[i].ring_id);
+		type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
+		rattr.dma_arr = nq->hwq.pbl[PBL_LVL_0].pg_map_arr;
+		rattr.pages = nq->hwq.pbl[rdev->nq[i].hwq.level].pg_count;
+		rattr.type = type;
+		rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX;
+		rattr.depth = BNXT_QPLIB_NQE_MAX_CNT - 1;
+		rattr.lrid = rdev->msix_entries[i + 1].ring_idx;
+		rc = bnxt_re_net_ring_alloc(rdev, &rattr, &nq->ring_id);
 		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to allocate NQ fw id with rc = 0x%x",
-				rc);
+			ibdev_err(&rdev->ibdev,
+				  "Failed to allocate NQ fw id with rc = 0x%x",
+				  rc);
 			bnxt_qplib_free_nq(&rdev->nq[i]);
 			goto free_nq;
 		}
@@ -1024,8 +1064,8 @@
 	}
 	return 0;
 free_nq:
-	for (i = num_vec_created; i >= 0; i--) {
-		type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
+	for (i = num_vec_created - 1; i >= 0; i--) {
+		type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
 		bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id, type);
 		bnxt_qplib_free_nq(&rdev->nq[i]);
 	}
@@ -1090,10 +1130,10 @@
 		return rc;
 
 	if (resp.queue_cfg_info) {
-		dev_warn(rdev_to_dev(rdev),
-			 "Asymmetric cos queue configuration detected");
-		dev_warn(rdev_to_dev(rdev),
-			 " on device, QoS may not be fully functional\n");
+		ibdev_warn(&rdev->ibdev,
+			   "Asymmetric cos queue configuration detected");
+		ibdev_warn(&rdev->ibdev,
+			   " on device, QoS may not be fully functional\n");
 	}
 	qcfgmap = &resp.pri0_cos_queue_id;
 	tmp_map = (u8 *)cid_map;
@@ -1106,7 +1146,8 @@
 static bool bnxt_re_is_qp1_or_shadow_qp(struct bnxt_re_dev *rdev,
 					struct bnxt_re_qp *qp)
 {
-	return (qp->ib_qp.qp_type == IB_QPT_GSI) || (qp == rdev->qp1_sqp);
+	return (qp->ib_qp.qp_type == IB_QPT_GSI) ||
+	       (qp == rdev->gsi_ctx.gsi_sqp);
 }
 
 static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev)
@@ -1141,12 +1182,13 @@
 	u16 gid_idx, index;
 	int rc = 0;
 
-	if (!test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags))
+	if (!ib_device_try_get(&rdev->ibdev))
 		return 0;
 
 	if (!sgid_tbl) {
-		dev_err(rdev_to_dev(rdev), "QPLIB: SGID table not allocated");
-		return -EINVAL;
+		ibdev_err(&rdev->ibdev, "QPLIB: SGID table not allocated");
+		rc = -EINVAL;
+		goto out;
 	}
 
 	for (index = 0; index < sgid_tbl->active; index++) {
@@ -1166,7 +1208,8 @@
 		rc = bnxt_qplib_update_sgid(sgid_tbl, &gid, gid_idx,
 					    rdev->qplib_res.netdev->dev_addr);
 	}
-
+out:
+	ib_device_put(&rdev->ibdev);
 	return rc;
 }
 
@@ -1222,7 +1265,7 @@
 	/* Get cosq id for this priority */
 	rc = bnxt_re_query_hwrm_pri2cos(rdev, 0, &cid_map);
 	if (rc) {
-		dev_warn(rdev_to_dev(rdev), "no cos for p_mask %x\n", prio_map);
+		ibdev_warn(&rdev->ibdev, "no cos for p_mask %x\n", prio_map);
 		return rc;
 	}
 	/* Parse CoS IDs for app priority */
@@ -1231,8 +1274,8 @@
 	/* Config BONO. */
 	rc = bnxt_qplib_map_tc2cos(&rdev->qplib_res, rdev->cosq);
 	if (rc) {
-		dev_warn(rdev_to_dev(rdev), "no tc for cos{%x, %x}\n",
-			 rdev->cosq[0], rdev->cosq[1]);
+		ibdev_warn(&rdev->ibdev, "no tc for cos{%x, %x}\n",
+			   rdev->cosq[0], rdev->cosq[1]);
 		return rc;
 	}
 
@@ -1267,8 +1310,8 @@
 			    sizeof(resp), DFLT_HWRM_CMD_TIMEOUT);
 	rc = en_dev->en_ops->bnxt_send_fw_msg(en_dev, BNXT_ROCE_ULP, &fw_msg);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev),
-			"Failed to query HW version, rc = 0x%x", rc);
+		ibdev_err(&rdev->ibdev, "Failed to query HW version, rc = 0x%x",
+			  rc);
 		return;
 	}
 	rdev->qplib_ctx.hwrm_intf_ver =
@@ -1278,15 +1321,35 @@
 		le16_to_cpu(resp.hwrm_intf_patch);
 }
 
-static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
+static int bnxt_re_ib_init(struct bnxt_re_dev *rdev)
+{
+	int rc = 0;
+	u32 event;
+
+	/* Register ib dev */
+	rc = bnxt_re_register_ib(rdev);
+	if (rc) {
+		pr_err("Failed to register with IB: %#x\n", rc);
+		return rc;
+	}
+	dev_info(rdev_to_dev(rdev), "Device registered successfully");
+	ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed,
+			 &rdev->active_width);
+	set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags);
+
+	event = netif_running(rdev->netdev) && netif_carrier_ok(rdev->netdev) ?
+		IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+
+	bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, event);
+
+	return rc;
+}
+
+static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev)
 {
 	u8 type;
 	int rc;
 
-	if (test_and_clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) {
-		/* Cleanup ib dev */
-		bnxt_re_unregister_ib(rdev);
-	}
 	if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags))
 		cancel_delayed_work_sync(&rdev->worker);
 
@@ -1299,28 +1362,28 @@
 	if (test_and_clear_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) {
 		rc = bnxt_qplib_deinit_rcfw(&rdev->rcfw);
 		if (rc)
-			dev_warn(rdev_to_dev(rdev),
-				 "Failed to deinitialize RCFW: %#x", rc);
+			ibdev_warn(&rdev->ibdev,
+				   "Failed to deinitialize RCFW: %#x", rc);
 		bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id);
-		bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx);
+		bnxt_qplib_free_ctx(&rdev->qplib_res, &rdev->qplib_ctx);
 		bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
-		type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
-		bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type);
+		type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
+		bnxt_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type);
 		bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
 	}
 	if (test_and_clear_bit(BNXT_RE_FLAG_GOT_MSIX, &rdev->flags)) {
 		rc = bnxt_re_free_msix(rdev);
 		if (rc)
-			dev_warn(rdev_to_dev(rdev),
-				 "Failed to free MSI-X vectors: %#x", rc);
+			ibdev_warn(&rdev->ibdev,
+				   "Failed to free MSI-X vectors: %#x", rc);
 	}
 
 	bnxt_re_destroy_chip_ctx(rdev);
 	if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) {
 		rc = bnxt_re_unregister_netdev(rdev);
 		if (rc)
-			dev_warn(rdev_to_dev(rdev),
-				 "Failed to unregister with netdev: %#x", rc);
+			ibdev_warn(&rdev->ibdev,
+				   "Failed to unregister with netdev: %#x", rc);
 	}
 }
 
@@ -1334,31 +1397,28 @@
 	schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000));
 }
 
-static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev)
+static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode)
 {
-	dma_addr_t *pg_map;
-	u32 db_offt, ridx;
-	int pages, vid;
-	bool locked;
+	struct bnxt_qplib_creq_ctx *creq;
+	struct bnxt_re_ring_attr rattr;
+	u32 db_offt;
+	int vid;
 	u8 type;
 	int rc;
 
-	/* Acquire rtnl lock through out this function */
-	rtnl_lock();
-	locked = true;
-
 	/* Registered a new RoCE device instance to netdev */
+	memset(&rattr, 0, sizeof(rattr));
 	rc = bnxt_re_register_netdev(rdev);
 	if (rc) {
-		rtnl_unlock();
-		pr_err("Failed to register with netedev: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to register with netedev: %#x\n", rc);
 		return -EINVAL;
 	}
 	set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags);
 
-	rc = bnxt_re_setup_chip_ctx(rdev);
+	rc = bnxt_re_setup_chip_ctx(rdev, wqe_mode);
 	if (rc) {
-		dev_err(rdev_to_dev(rdev), "Failed to get chip context\n");
+		ibdev_err(&rdev->ibdev, "Failed to get chip context\n");
 		return -EINVAL;
 	}
 
@@ -1367,7 +1427,8 @@
 
 	rc = bnxt_re_request_msix(rdev);
 	if (rc) {
-		pr_err("Failed to get MSI-X vectors: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to get MSI-X vectors: %#x\n", rc);
 		rc = -EINVAL;
 		goto fail;
 	}
@@ -1378,31 +1439,36 @@
 	/* Establish RCFW Communication Channel to initialize the context
 	 * memory for the function and all child VFs
 	 */
-	rc = bnxt_qplib_alloc_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw,
+	rc = bnxt_qplib_alloc_rcfw_channel(&rdev->qplib_res, &rdev->rcfw,
 					   &rdev->qplib_ctx,
 					   BNXT_RE_MAX_QPC_COUNT);
 	if (rc) {
-		pr_err("Failed to allocate RCFW Channel: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to allocate RCFW Channel: %#x\n", rc);
 		goto fail;
 	}
-	type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
-	pg_map = rdev->rcfw.creq.pbl[PBL_LVL_0].pg_map_arr;
-	pages = rdev->rcfw.creq.pbl[rdev->rcfw.creq.level].pg_count;
-	ridx = rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx;
-	rc = bnxt_re_net_ring_alloc(rdev, pg_map, pages, type,
-				    BNXT_QPLIB_CREQE_MAX_CNT - 1,
-				    ridx, &rdev->rcfw.creq_ring_id);
+
+	type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
+	creq = &rdev->rcfw.creq;
+	rattr.dma_arr = creq->hwq.pbl[PBL_LVL_0].pg_map_arr;
+	rattr.pages = creq->hwq.pbl[creq->hwq.level].pg_count;
+	rattr.type = type;
+	rattr.mode = RING_ALLOC_REQ_INT_MODE_MSIX;
+	rattr.depth = BNXT_QPLIB_CREQE_MAX_CNT - 1;
+	rattr.lrid = rdev->msix_entries[BNXT_RE_AEQ_IDX].ring_idx;
+	rc = bnxt_re_net_ring_alloc(rdev, &rattr, &creq->ring_id);
 	if (rc) {
-		pr_err("Failed to allocate CREQ: %#x\n", rc);
+		ibdev_err(&rdev->ibdev, "Failed to allocate CREQ: %#x\n", rc);
 		goto free_rcfw;
 	}
 	db_offt = bnxt_re_get_nqdb_offset(rdev, BNXT_RE_AEQ_IDX);
 	vid = rdev->msix_entries[BNXT_RE_AEQ_IDX].vector;
-	rc = bnxt_qplib_enable_rcfw_channel(rdev->en_dev->pdev, &rdev->rcfw,
+	rc = bnxt_qplib_enable_rcfw_channel(&rdev->rcfw,
 					    vid, db_offt, rdev->is_virtfn,
 					    &bnxt_re_aeq_handler);
 	if (rc) {
-		pr_err("Failed to enable RCFW channel: %#x\n", rc);
+		ibdev_err(&rdev->ibdev, "Failed to enable RCFW channel: %#x\n",
+			  rc);
 		goto free_ring;
 	}
 
@@ -1410,27 +1476,30 @@
 				     rdev->is_virtfn);
 	if (rc)
 		goto disable_rcfw;
-	if (!rdev->is_virtfn)
-		bnxt_re_set_resource_limits(rdev);
 
-	rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0,
-				  bnxt_qplib_is_chip_gen_p5(&rdev->chip_ctx));
+	bnxt_re_set_resource_limits(rdev);
+
+	rc = bnxt_qplib_alloc_ctx(&rdev->qplib_res, &rdev->qplib_ctx, 0,
+				  bnxt_qplib_is_chip_gen_p5(rdev->chip_ctx));
 	if (rc) {
-		pr_err("Failed to allocate QPLIB context: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to allocate QPLIB context: %#x\n", rc);
 		goto disable_rcfw;
 	}
 	rc = bnxt_re_net_stats_ctx_alloc(rdev,
 					 rdev->qplib_ctx.stats.dma_map,
 					 &rdev->qplib_ctx.stats.fw_id);
 	if (rc) {
-		pr_err("Failed to allocate stats context: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to allocate stats context: %#x\n", rc);
 		goto free_ctx;
 	}
 
 	rc = bnxt_qplib_init_rcfw(&rdev->rcfw, &rdev->qplib_ctx,
 				  rdev->is_virtfn);
 	if (rc) {
-		pr_err("Failed to initialize RCFW: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to initialize RCFW: %#x\n", rc);
 		goto free_sctx;
 	}
 	set_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags);
@@ -1438,13 +1507,15 @@
 	/* Resources based on the 'new' device caps */
 	rc = bnxt_re_alloc_res(rdev);
 	if (rc) {
-		pr_err("Failed to allocate resources: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to allocate resources: %#x\n", rc);
 		goto fail;
 	}
 	set_bit(BNXT_RE_FLAG_RESOURCES_ALLOCATED, &rdev->flags);
 	rc = bnxt_re_init_res(rdev);
 	if (rc) {
-		pr_err("Failed to initialize resources: %#x\n", rc);
+		ibdev_err(&rdev->ibdev,
+			  "Failed to initialize resources: %#x\n", rc);
 		goto fail;
 	}
 
@@ -1453,46 +1524,28 @@
 	if (!rdev->is_virtfn) {
 		rc = bnxt_re_setup_qos(rdev);
 		if (rc)
-			pr_info("RoCE priority not yet configured\n");
+			ibdev_info(&rdev->ibdev,
+				   "RoCE priority not yet configured\n");
 
 		INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker);
 		set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags);
 		schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000));
 	}
 
-	rtnl_unlock();
-	locked = false;
-
-	/* Register ib dev */
-	rc = bnxt_re_register_ib(rdev);
-	if (rc) {
-		pr_err("Failed to register with IB: %#x\n", rc);
-		goto fail;
-	}
-	set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags);
-	dev_info(rdev_to_dev(rdev), "Device registered successfully");
-	ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed,
-			 &rdev->active_width);
-	set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags);
-	bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_PORT_ACTIVE);
-
 	return 0;
 free_sctx:
 	bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id);
 free_ctx:
-	bnxt_qplib_free_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx);
+	bnxt_qplib_free_ctx(&rdev->qplib_res, &rdev->qplib_ctx);
 disable_rcfw:
 	bnxt_qplib_disable_rcfw_channel(&rdev->rcfw);
 free_ring:
-	type = bnxt_qplib_get_ring_type(&rdev->chip_ctx);
-	bnxt_re_net_ring_free(rdev, rdev->rcfw.creq_ring_id, type);
+	type = bnxt_qplib_get_ring_type(rdev->chip_ctx);
+	bnxt_re_net_ring_free(rdev, rdev->rcfw.creq.ring_id, type);
 free_rcfw:
 	bnxt_qplib_free_rcfw_channel(&rdev->rcfw);
 fail:
-	if (!locked)
-		rtnl_lock();
-	bnxt_re_ib_unreg(rdev);
-	rtnl_unlock();
+	bnxt_re_dev_uninit(rdev);
 
 	return rc;
 }
@@ -1519,7 +1572,8 @@
 	en_dev = bnxt_re_dev_probe(netdev);
 	if (IS_ERR(en_dev)) {
 		if (en_dev != ERR_PTR(-ENODEV))
-			pr_err("%s: Failed to probe\n", ROCE_DRV_MODULE_NAME);
+			ibdev_err(&(*rdev)->ibdev, "%s: Failed to probe\n",
+				  ROCE_DRV_MODULE_NAME);
 		rc = PTR_ERR(en_dev);
 		goto exit;
 	}
@@ -1533,9 +1587,47 @@
 	return rc;
 }
 
-static void bnxt_re_remove_one(struct bnxt_re_dev *rdev)
+static void bnxt_re_remove_device(struct bnxt_re_dev *rdev)
 {
+	bnxt_re_dev_uninit(rdev);
 	pci_dev_put(rdev->en_dev->pdev);
+	bnxt_re_dev_unreg(rdev);
+}
+
+static int bnxt_re_add_device(struct bnxt_re_dev **rdev,
+			      struct net_device *netdev, u8 wqe_mode)
+{
+	int rc;
+
+	rc = bnxt_re_dev_reg(rdev, netdev);
+	if (rc == -ENODEV)
+		return rc;
+	if (rc) {
+		pr_err("Failed to register with the device %s: %#x\n",
+		       netdev->name, rc);
+		return rc;
+	}
+
+	pci_dev_get((*rdev)->en_dev->pdev);
+	rc = bnxt_re_dev_init(*rdev, wqe_mode);
+	if (rc) {
+		pci_dev_put((*rdev)->en_dev->pdev);
+		bnxt_re_dev_unreg(*rdev);
+	}
+
+	return rc;
+}
+
+static void bnxt_re_dealloc_driver(struct ib_device *ib_dev)
+{
+	struct bnxt_re_dev *rdev =
+		container_of(ib_dev, struct bnxt_re_dev, ibdev);
+
+	dev_info(rdev_to_dev(rdev), "Unregistering Device");
+
+	rtnl_lock();
+	bnxt_re_remove_device(rdev);
+	rtnl_unlock();
 }
 
 /* Handle all deferred netevents tasks */
@@ -1548,21 +1640,23 @@
 	re_work = container_of(work, struct bnxt_re_work, work);
 	rdev = re_work->rdev;
 
-	if (re_work->event != NETDEV_REGISTER &&
-	    !test_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags))
-		return;
-
-	switch (re_work->event) {
-	case NETDEV_REGISTER:
-		rc = bnxt_re_ib_reg(rdev);
+	if (re_work->event == NETDEV_REGISTER) {
+		rc = bnxt_re_ib_init(rdev);
 		if (rc) {
-			dev_err(rdev_to_dev(rdev),
-				"Failed to register with IB: %#x", rc);
-			bnxt_re_remove_one(rdev);
-			bnxt_re_dev_unreg(rdev);
+			ibdev_err(&rdev->ibdev,
+				  "Failed to register with IB: %#x", rc);
+			rtnl_lock();
+			bnxt_re_remove_device(rdev);
+			rtnl_unlock();
 			goto exit;
 		}
-		break;
+		goto exit;
+	}
+
+	if (!ib_device_try_get(&rdev->ibdev))
+		goto exit;
+
+	switch (re_work->event) {
 	case NETDEV_UP:
 		bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1,
 				       IB_EVENT_PORT_ACTIVE);
@@ -1582,17 +1676,12 @@
 	default:
 		break;
 	}
-	smp_mb__before_atomic();
-	atomic_dec(&rdev->sched_count);
+	ib_device_put(&rdev->ibdev);
 exit:
+	put_device(&rdev->ibdev.dev);
 	kfree(re_work);
 }
 
-static void bnxt_re_init_one(struct bnxt_re_dev *rdev)
-{
-	pci_dev_get(rdev->en_dev->pdev);
-}
-
 /*
  * "Notifier chain callback can be invoked for the same chain from
  * different CPUs at the same time".
@@ -1615,6 +1704,7 @@
 	struct bnxt_re_dev *rdev;
 	int rc = 0;
 	bool sch_work = false;
+	bool release = true;
 
 	real_dev = rdma_vlan_dev_real_dev(netdev);
 	if (!real_dev)
@@ -1622,7 +1712,8 @@
 
 	rdev = bnxt_re_from_netdev(real_dev);
 	if (!rdev && event != NETDEV_REGISTER)
-		goto exit;
+		return NOTIFY_OK;
+
 	if (real_dev != netdev)
 		goto exit;
 
@@ -1630,27 +1721,15 @@
 	case NETDEV_REGISTER:
 		if (rdev)
 			break;
-		rc = bnxt_re_dev_reg(&rdev, real_dev);
-		if (rc == -ENODEV)
-			break;
-		if (rc) {
-			pr_err("Failed to register with the device %s: %#x\n",
-			       real_dev->name, rc);
-			break;
-		}
-		bnxt_re_init_one(rdev);
-		sch_work = true;
+		rc = bnxt_re_add_device(&rdev, real_dev,
+					BNXT_QPLIB_WQE_MODE_STATIC);
+		if (!rc)
+			sch_work = true;
+		release = false;
 		break;
 
 	case NETDEV_UNREGISTER:
-		/* netdev notifier will call NETDEV_UNREGISTER again later since
-		 * we are still holding the reference to the netdev
-		 */
-		if (atomic_read(&rdev->sched_count) > 0)
-			goto exit;
-		bnxt_re_ib_unreg(rdev);
-		bnxt_re_remove_one(rdev);
-		bnxt_re_dev_unreg(rdev);
+		ib_unregister_device_queued(&rdev->ibdev);
 		break;
 
 	default:
@@ -1661,17 +1740,19 @@
 		/* Allocate for the deferred task */
 		re_work = kzalloc(sizeof(*re_work), GFP_ATOMIC);
 		if (re_work) {
+			get_device(&rdev->ibdev.dev);
 			re_work->rdev = rdev;
 			re_work->event = event;
 			re_work->vlan_dev = (real_dev == netdev ?
 					     NULL : netdev);
 			INIT_WORK(&re_work->work, bnxt_re_task);
-			atomic_inc(&rdev->sched_count);
 			queue_work(bnxt_re_wq, &re_work->work);
 		}
 	}
 
 exit:
+	if (rdev && release)
+		ib_device_put(&rdev->ibdev);
 	return NOTIFY_DONE;
 }
 
@@ -1707,36 +1788,21 @@
 
 static void __exit bnxt_re_mod_exit(void)
 {
-	struct bnxt_re_dev *rdev, *next;
-	LIST_HEAD(to_be_deleted);
+	struct bnxt_re_dev *rdev;
 
-	mutex_lock(&bnxt_re_dev_lock);
-	/* Free all adapter allocated resources */
-	if (!list_empty(&bnxt_re_dev_list))
-		list_splice_init(&bnxt_re_dev_list, &to_be_deleted);
-	mutex_unlock(&bnxt_re_dev_lock);
-       /*
-	* Cleanup the devices in reverse order so that the VF device
-	* cleanup is done before PF cleanup
-	*/
-	list_for_each_entry_safe_reverse(rdev, next, &to_be_deleted, list) {
-		dev_info(rdev_to_dev(rdev), "Unregistering Device");
-		/*
-		 * Flush out any scheduled tasks before destroying the
-		 * resources
-		 */
-		flush_workqueue(bnxt_re_wq);
-		bnxt_re_dev_stop(rdev);
-		/* Acquire the rtnl_lock as the L2 resources are freed here */
-		rtnl_lock();
-		bnxt_re_ib_unreg(rdev);
-		rtnl_unlock();
-		bnxt_re_remove_one(rdev);
-		bnxt_re_dev_unreg(rdev);
-	}
 	unregister_netdevice_notifier(&bnxt_re_netdev_notifier);
 	if (bnxt_re_wq)
 		destroy_workqueue(bnxt_re_wq);
+	list_for_each_entry(rdev, &bnxt_re_dev_list, list) {
+		/* VF device removal should be called before the removal
+		 * of PF device. Queue VFs unregister first, so that VFs
+		 * shall be removed before the PF during the call of
+		 * ib_unregister_driver.
+		 */
+		if (rdev->is_virtfn)
+			ib_unregister_device(&rdev->ibdev);
+	}
+	ib_unregister_driver(RDMA_DRIVER_BNXT_RE);
 }
 
 module_init(bnxt_re_mod_init);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index 4d07d22..bd153aa 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -43,6 +43,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/pci.h>
+#include <linux/delay.h>
 #include <linux/prefetch.h>
 #include <linux/if_ether.h>
 
@@ -53,9 +54,7 @@
 #include "qplib_sp.h"
 #include "qplib_fp.h"
 
-static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq);
 static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp);
-static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type);
 
 static void bnxt_qplib_cancel_phantom_processing(struct bnxt_qplib_qp *qp)
 {
@@ -179,11 +178,11 @@
 
 	if (qp->rq_hdr_buf)
 		dma_free_coherent(&res->pdev->dev,
-				  rq->hwq.max_elements * qp->rq_hdr_buf_size,
+				  rq->max_wqe * qp->rq_hdr_buf_size,
 				  qp->rq_hdr_buf, qp->rq_hdr_buf_map);
 	if (qp->sq_hdr_buf)
 		dma_free_coherent(&res->pdev->dev,
-				  sq->hwq.max_elements * qp->sq_hdr_buf_size,
+				  sq->max_wqe * qp->sq_hdr_buf_size,
 				  qp->sq_hdr_buf, qp->sq_hdr_buf_map);
 	qp->rq_hdr_buf = NULL;
 	qp->sq_hdr_buf = NULL;
@@ -200,10 +199,9 @@
 	struct bnxt_qplib_q *sq = &qp->sq;
 	int rc = 0;
 
-	if (qp->sq_hdr_buf_size && sq->hwq.max_elements) {
+	if (qp->sq_hdr_buf_size && sq->max_wqe) {
 		qp->sq_hdr_buf = dma_alloc_coherent(&res->pdev->dev,
-					sq->hwq.max_elements *
-					qp->sq_hdr_buf_size,
+					sq->max_wqe * qp->sq_hdr_buf_size,
 					&qp->sq_hdr_buf_map, GFP_KERNEL);
 		if (!qp->sq_hdr_buf) {
 			rc = -ENOMEM;
@@ -213,9 +211,9 @@
 		}
 	}
 
-	if (qp->rq_hdr_buf_size && rq->hwq.max_elements) {
+	if (qp->rq_hdr_buf_size && rq->max_wqe) {
 		qp->rq_hdr_buf = dma_alloc_coherent(&res->pdev->dev,
-						    rq->hwq.max_elements *
+						    rq->max_wqe *
 						    qp->rq_hdr_buf_size,
 						    &qp->rq_hdr_buf_map,
 						    GFP_KERNEL);
@@ -233,20 +231,16 @@
 	return rc;
 }
 
-static void bnxt_qplib_service_nq(unsigned long data)
+static void clean_nq(struct bnxt_qplib_nq *nq, struct bnxt_qplib_cq *cq)
 {
-	struct bnxt_qplib_nq *nq = (struct bnxt_qplib_nq *)data;
 	struct bnxt_qplib_hwq *hwq = &nq->hwq;
 	struct nq_base *nqe, **nq_ptr;
-	struct bnxt_qplib_cq *cq;
-	int num_cqne_processed = 0;
-	int num_srqne_processed = 0;
-	u32 sw_cons, raw_cons;
-	u16 type;
 	int budget = nq->budget;
+	u32 sw_cons, raw_cons;
 	uintptr_t q_handle;
-	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
+	u16 type;
 
+	spin_lock_bh(&hwq->lock);
 	/* Service the NQ until empty */
 	raw_cons = hwq->cons;
 	while (budget--) {
@@ -271,8 +265,78 @@
 			q_handle = le32_to_cpu(nqcne->cq_handle_low);
 			q_handle |= (u64)le32_to_cpu(nqcne->cq_handle_high)
 						     << 32;
+			if ((unsigned long)cq == q_handle) {
+				nqcne->cq_handle_low = 0;
+				nqcne->cq_handle_high = 0;
+				cq->cnq_events++;
+			}
+			break;
+		}
+		default:
+			break;
+		}
+		raw_cons++;
+	}
+	spin_unlock_bh(&hwq->lock);
+}
+
+/* Wait for receiving all NQEs for this CQ and clean the NQEs associated with
+ * this CQ.
+ */
+static void __wait_for_all_nqes(struct bnxt_qplib_cq *cq, u16 cnq_events)
+{
+	u32 retry_cnt = 100;
+
+	while (retry_cnt--) {
+		if (cnq_events == cq->cnq_events)
+			return;
+		usleep_range(50, 100);
+		clean_nq(cq->nq, cq);
+	}
+}
+
+static void bnxt_qplib_service_nq(struct tasklet_struct *t)
+{
+	struct bnxt_qplib_nq *nq = from_tasklet(nq, t, nq_tasklet);
+	struct bnxt_qplib_hwq *hwq = &nq->hwq;
+	int num_srqne_processed = 0;
+	int num_cqne_processed = 0;
+	struct bnxt_qplib_cq *cq;
+	int budget = nq->budget;
+	u32 sw_cons, raw_cons;
+	struct nq_base *nqe;
+	uintptr_t q_handle;
+	u16 type;
+
+	spin_lock_bh(&hwq->lock);
+	/* Service the NQ until empty */
+	raw_cons = hwq->cons;
+	while (budget--) {
+		sw_cons = HWQ_CMP(raw_cons, hwq);
+		nqe = bnxt_qplib_get_qe(hwq, sw_cons, NULL);
+		if (!NQE_CMP_VALID(nqe, raw_cons, hwq->max_elements))
+			break;
+
+		/*
+		 * The valid test of the entry must be done first before
+		 * reading any further.
+		 */
+		dma_rmb();
+
+		type = le16_to_cpu(nqe->info10_type) & NQ_BASE_TYPE_MASK;
+		switch (type) {
+		case NQ_BASE_TYPE_CQ_NOTIFICATION:
+		{
+			struct nq_cn *nqcne = (struct nq_cn *)nqe;
+
+			q_handle = le32_to_cpu(nqcne->cq_handle_low);
+			q_handle |= (u64)le32_to_cpu(nqcne->cq_handle_high)
+						     << 32;
 			cq = (struct bnxt_qplib_cq *)(unsigned long)q_handle;
-			bnxt_qplib_arm_cq_enable(cq);
+			if (!cq)
+				break;
+			bnxt_qplib_armen_db(&cq->dbinfo,
+					    DBC_DBC_TYPE_CQ_ARMENA);
 			spin_lock_bh(&cq->compl_lock);
 			atomic_set(&cq->arm_state, 0);
 			if (!nq->cqn_handler(nq, (cq)))
@@ -280,19 +344,22 @@
 			else
 				dev_warn(&nq->pdev->dev,
 					 "cqn - type 0x%x not handled\n", type);
+			cq->cnq_events++;
 			spin_unlock_bh(&cq->compl_lock);
 			break;
 		}
 		case NQ_BASE_TYPE_SRQ_EVENT:
 		{
+			struct bnxt_qplib_srq *srq;
 			struct nq_srq_event *nqsrqe =
 						(struct nq_srq_event *)nqe;
 
 			q_handle = le32_to_cpu(nqsrqe->srq_handle_low);
 			q_handle |= (u64)le32_to_cpu(nqsrqe->srq_handle_high)
 				     << 32;
-			bnxt_qplib_arm_srq((struct bnxt_qplib_srq *)q_handle,
-					   DBC_DBC_TYPE_SRQ_ARMENA);
+			srq = (struct bnxt_qplib_srq *)q_handle;
+			bnxt_qplib_armen_db(&srq->dbinfo,
+					    DBC_DBC_TYPE_SRQ_ARMENA);
 			if (!nq->srqn_handler(nq,
 					      (struct bnxt_qplib_srq *)q_handle,
 					      nqsrqe->event))
@@ -314,44 +381,39 @@
 	}
 	if (hwq->cons != raw_cons) {
 		hwq->cons = raw_cons;
-		bnxt_qplib_ring_nq_db_rearm(nq->bar_reg_iomem, hwq->cons,
-					    hwq->max_elements, nq->ring_id,
-					    gen_p5);
+		bnxt_qplib_ring_nq_db(&nq->nq_db.dbinfo, nq->res->cctx, true);
 	}
+	spin_unlock_bh(&hwq->lock);
 }
 
 static irqreturn_t bnxt_qplib_nq_irq(int irq, void *dev_instance)
 {
 	struct bnxt_qplib_nq *nq = dev_instance;
 	struct bnxt_qplib_hwq *hwq = &nq->hwq;
-	struct nq_base **nq_ptr;
 	u32 sw_cons;
 
 	/* Prefetch the NQ element */
 	sw_cons = HWQ_CMP(hwq->cons, hwq);
-	nq_ptr = (struct nq_base **)nq->hwq.pbl_ptr;
-	prefetch(&nq_ptr[NQE_PG(sw_cons)][NQE_IDX(sw_cons)]);
+	prefetch(bnxt_qplib_get_qe(hwq, sw_cons, NULL));
 
 	/* Fan out to CPU affinitized kthreads? */
-	tasklet_schedule(&nq->worker);
+	tasklet_schedule(&nq->nq_tasklet);
 
 	return IRQ_HANDLED;
 }
 
 void bnxt_qplib_nq_stop_irq(struct bnxt_qplib_nq *nq, bool kill)
 {
-	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
-	tasklet_disable(&nq->worker);
+	tasklet_disable(&nq->nq_tasklet);
 	/* Mask h/w interrupt */
-	bnxt_qplib_ring_nq_db(nq->bar_reg_iomem, nq->hwq.cons,
-			      nq->hwq.max_elements, nq->ring_id, gen_p5);
+	bnxt_qplib_ring_nq_db(&nq->nq_db.dbinfo, nq->res->cctx, false);
 	/* Sync with last running IRQ handler */
-	synchronize_irq(nq->vector);
+	synchronize_irq(nq->msix_vec);
 	if (kill)
-		tasklet_kill(&nq->worker);
+		tasklet_kill(&nq->nq_tasklet);
 	if (nq->requested) {
-		irq_set_affinity_hint(nq->vector, NULL);
-		free_irq(nq->vector, nq);
+		irq_set_affinity_hint(nq->msix_vec, NULL);
+		free_irq(nq->msix_vec, nq);
 		nq->requested = false;
 	}
 }
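Editor's note: the stop path above follows a strict ordering so no notification is serviced after teardown — disable the tasklet, mask the hardware doorbell, wait out any in-flight interrupt handler, optionally kill the tasklet, then release the vector. A kernel-style sketch of that ordering follows; struct my_eq and its field names are illustrative, not the driver's, and the snippet only builds inside a kernel tree.

    #include <linux/types.h>
    #include <linux/interrupt.h>

    struct my_eq {
            int vec;                        /* MSI-X vector number */
            struct tasklet_struct tl;
            bool requested;
    };

    static void my_eq_stop(struct my_eq *eq, bool kill)
    {
            tasklet_disable(&eq->tl);       /* 1. block further softirq runs */
            /* 2. mask the hardware doorbell here (no re-arm) */
            synchronize_irq(eq->vec);       /* 3. wait for an in-flight handler */
            if (kill)
                    tasklet_kill(&eq->tl);  /* 4. wait for a queued run to end */
            if (eq->requested) {
                    irq_set_affinity_hint(eq->vec, NULL);
                    free_irq(eq->vec, eq);  /* 5. release the vector last */
                    eq->requested = false;
            }
    }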
@@ -364,89 +426,107 @@
 	}
 
 	/* Make sure the HW is stopped! */
-	if (nq->requested)
-		bnxt_qplib_nq_stop_irq(nq, true);
+	bnxt_qplib_nq_stop_irq(nq, true);
 
-	if (nq->bar_reg_iomem)
-		iounmap(nq->bar_reg_iomem);
-	nq->bar_reg_iomem = NULL;
+	if (nq->nq_db.reg.bar_reg) {
+		iounmap(nq->nq_db.reg.bar_reg);
+		nq->nq_db.reg.bar_reg = NULL;
+	}
 
 	nq->cqn_handler = NULL;
 	nq->srqn_handler = NULL;
-	nq->vector = 0;
+	nq->msix_vec = 0;
 }
 
 int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx,
 			    int msix_vector, bool need_init)
 {
-	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(nq->res->cctx);
 	int rc;
 
 	if (nq->requested)
 		return -EFAULT;
 
-	nq->vector = msix_vector;
+	nq->msix_vec = msix_vector;
 	if (need_init)
-		tasklet_init(&nq->worker, bnxt_qplib_service_nq,
-			     (unsigned long)nq);
+		tasklet_setup(&nq->nq_tasklet, bnxt_qplib_service_nq);
 	else
-		tasklet_enable(&nq->worker);
+		tasklet_enable(&nq->nq_tasklet);
 
 	snprintf(nq->name, sizeof(nq->name), "bnxt_qplib_nq-%d", nq_indx);
-	rc = request_irq(nq->vector, bnxt_qplib_nq_irq, 0, nq->name, nq);
+	rc = request_irq(nq->msix_vec, bnxt_qplib_nq_irq, 0, nq->name, nq);
 	if (rc)
 		return rc;
 
 	cpumask_clear(&nq->mask);
 	cpumask_set_cpu(nq_indx, &nq->mask);
-	rc = irq_set_affinity_hint(nq->vector, &nq->mask);
+	rc = irq_set_affinity_hint(nq->msix_vec, &nq->mask);
 	if (rc) {
 		dev_warn(&nq->pdev->dev,
 			 "set affinity failed; vector: %d nq_idx: %d\n",
-			 nq->vector, nq_indx);
+			 nq->msix_vec, nq_indx);
 	}
 	nq->requested = true;
-	bnxt_qplib_ring_nq_db_rearm(nq->bar_reg_iomem, nq->hwq.cons,
-				    nq->hwq.max_elements, nq->ring_id, gen_p5);
+	bnxt_qplib_ring_nq_db(&nq->nq_db.dbinfo, nq->res->cctx, true);
 
 	return rc;
 }
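Editor's note: this hunk is part of the tree-wide move from tasklet_init() with an unsigned long cookie to tasklet_setup() with a typed callback that recovers its container via from_tasklet(), exactly as bnxt_qplib_service_nq() does above. A minimal sketch of the conversion pattern, using illustrative names (my_ctx, my_work):

    #include <linux/interrupt.h>

    struct my_ctx {
            struct tasklet_struct tl;
            int pending;
    };

    static void my_work(struct tasklet_struct *t)
    {
            /* recover the container from the embedded tasklet member */
            struct my_ctx *ctx = from_tasklet(ctx, t, tl);

            ctx->pending = 0;
    }

    static void my_ctx_init(struct my_ctx *ctx)
    {
            /* replaces tasklet_init(&ctx->tl, fn, (unsigned long)ctx) */
            tasklet_setup(&ctx->tl, my_work);
    }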
 
+static int bnxt_qplib_map_nq_db(struct bnxt_qplib_nq *nq,  u32 reg_offt)
+{
+	resource_size_t reg_base;
+	struct bnxt_qplib_nq_db *nq_db;
+	struct pci_dev *pdev;
+	int rc = 0;
+
+	pdev = nq->pdev;
+	nq_db = &nq->nq_db;
+
+	nq_db->reg.bar_id = NQ_CONS_PCI_BAR_REGION;
+	nq_db->reg.bar_base = pci_resource_start(pdev, nq_db->reg.bar_id);
+	if (!nq_db->reg.bar_base) {
+		dev_err(&pdev->dev, "QPLIB: NQ BAR region %d resc start is 0!",
+			nq_db->reg.bar_id);
+		rc = -ENOMEM;
+		goto fail;
+	}
+
+	reg_base = nq_db->reg.bar_base + reg_offt;
+	/* Unconditionally map 8 bytes to support 57500 series */
+	nq_db->reg.len = 8;
+	nq_db->reg.bar_reg = ioremap(reg_base, nq_db->reg.len);
+	if (!nq_db->reg.bar_reg) {
+		dev_err(&pdev->dev, "QPLIB: NQ BAR region %d mapping failed",
+			nq_db->reg.bar_id);
+		rc = -ENOMEM;
+		goto fail;
+	}
+
+	nq_db->dbinfo.db = nq_db->reg.bar_reg;
+	nq_db->dbinfo.hwq = &nq->hwq;
+	nq_db->dbinfo.xid = nq->ring_id;
+fail:
+	return rc;
+}
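Editor's note: the new helper above maps only the 8-byte doorbell window inside the consumer BAR rather than the whole region, and uses plain ioremap() since ioremap_nocache() is gone in this kernel. A reduced kernel-style sketch of that mapping, with the BAR index and offset left as illustrative parameters:

    #include <linux/pci.h>
    #include <linux/io.h>

    /* bar/offt are illustrative; only the 8-byte doorbell itself is mapped */
    static void __iomem *map_db_window(struct pci_dev *pdev, int bar, u32 offt)
    {
            resource_size_t base = pci_resource_start(pdev, bar);

            if (!base)
                    return NULL;
            return ioremap(base + offt, 8);
    }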
+
 int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
 			 int nq_idx, int msix_vector, int bar_reg_offset,
-			 int (*cqn_handler)(struct bnxt_qplib_nq *nq,
-					    struct bnxt_qplib_cq *),
-			 int (*srqn_handler)(struct bnxt_qplib_nq *nq,
-					     struct bnxt_qplib_srq *,
-					     u8 event))
+			 cqn_handler_t cqn_handler,
+			 srqn_handler_t srqn_handler)
 {
-	resource_size_t nq_base;
 	int rc = -1;
 
-	if (cqn_handler)
-		nq->cqn_handler = cqn_handler;
-
-	if (srqn_handler)
-		nq->srqn_handler = srqn_handler;
+	nq->pdev = pdev;
+	nq->cqn_handler = cqn_handler;
+	nq->srqn_handler = srqn_handler;
 
 	/* Have a task to schedule CQ notifiers in post send case */
 	nq->cqn_wq  = create_singlethread_workqueue("bnxt_qplib_nq");
 	if (!nq->cqn_wq)
 		return -ENOMEM;
 
-	nq->bar_reg = NQ_CONS_PCI_BAR_REGION;
-	nq->bar_reg_off = bar_reg_offset;
-	nq_base = pci_resource_start(pdev, nq->bar_reg);
-	if (!nq_base) {
-		rc = -ENOMEM;
+	rc = bnxt_qplib_map_nq_db(nq, bar_reg_offset);
+	if (rc)
 		goto fail;
-	}
-	/* Unconditionally map 8 bytes to support 57500 series */
-	nq->bar_reg_iomem = ioremap_nocache(nq_base + nq->bar_reg_off, 8);
-	if (!nq->bar_reg_iomem) {
-		rc = -ENOMEM;
-		goto fail;
-	}
 
 	rc = bnxt_qplib_nq_start_irq(nq, nq_idx, msix_vector, true);
 	if (rc) {
@@ -464,49 +544,38 @@
 void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq)
 {
 	if (nq->hwq.max_elements) {
-		bnxt_qplib_free_hwq(nq->pdev, &nq->hwq);
+		bnxt_qplib_free_hwq(nq->res, &nq->hwq);
 		nq->hwq.max_elements = 0;
 	}
 }
 
-int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq)
+int bnxt_qplib_alloc_nq(struct bnxt_qplib_res *res, struct bnxt_qplib_nq *nq)
 {
-	u8 hwq_type;
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
+	struct bnxt_qplib_sg_info sginfo = {};
 
-	nq->pdev = pdev;
+	nq->pdev = res->pdev;
+	nq->res = res;
 	if (!nq->hwq.max_elements ||
 	    nq->hwq.max_elements > BNXT_QPLIB_NQE_MAX_CNT)
 		nq->hwq.max_elements = BNXT_QPLIB_NQE_MAX_CNT;
-	hwq_type = bnxt_qplib_get_hwq_type(nq->res);
-	if (bnxt_qplib_alloc_init_hwq(nq->pdev, &nq->hwq, NULL,
-				      &nq->hwq.max_elements,
-				      BNXT_QPLIB_MAX_NQE_ENTRY_SIZE, 0,
-				      PAGE_SIZE, hwq_type))
-		return -ENOMEM;
 
+	sginfo.pgsize = PAGE_SIZE;
+	sginfo.pgshft = PAGE_SHIFT;
+	hwq_attr.res = res;
+	hwq_attr.sginfo = &sginfo;
+	hwq_attr.depth = nq->hwq.max_elements;
+	hwq_attr.stride = sizeof(struct nq_base);
+	hwq_attr.type = bnxt_qplib_get_hwq_type(nq->res);
+	if (bnxt_qplib_alloc_init_hwq(&nq->hwq, &hwq_attr)) {
+		dev_err(&nq->pdev->dev, "FP NQ allocation failed");
+		return -ENOMEM;
+	}
 	nq->budget = 8;
 	return 0;
 }
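Editor's note: bnxt_qplib_alloc_init_hwq() now takes a zero-initialised attribute structure instead of a long positional argument list, which is why the allocation above only fills the fields it cares about. A plain C sketch of that options-struct pattern follows; struct queue, struct queue_attr and queue_alloc() are illustrative stand-ins, not the driver's API.

    #include <stddef.h>

    struct queue;                           /* opaque here */

    struct queue_attr {
            size_t depth;                   /* number of elements */
            size_t stride;                  /* bytes per element */
            size_t pg_size;                 /* backing page size */
            int    type;                    /* queue vs. context memory */
    };

    int queue_alloc(struct queue *q, const struct queue_attr *attr);

    /* unspecified fields stay zero, so new fields can be added later
     * without touching every caller
     */
    static int example_alloc(struct queue *q)
    {
            struct queue_attr attr = {
                    .depth   = 1024,
                    .stride  = 16,
                    .pg_size = 4096,
            };

            return queue_alloc(q, &attr);
    }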
 
 /* SRQ */
-static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type)
-{
-	struct bnxt_qplib_hwq *srq_hwq = &srq->hwq;
-	void __iomem *db;
-	u32 sw_prod;
-	u64 val = 0;
-
-	/* Ring DB */
-	sw_prod = (arm_type == DBC_DBC_TYPE_SRQ_ARM) ?
-		   srq->threshold : HWQ_CMP(srq_hwq->prod, srq_hwq);
-	db = (arm_type == DBC_DBC_TYPE_SRQ_ARMENA) ? srq->dbr_base :
-						     srq->dpi->dbr;
-	val = ((srq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | arm_type;
-	val <<= 32;
-	val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
-	writeq(val, db);
-}
-
 void bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
 			   struct bnxt_qplib_srq *srq)
 {
@@ -526,24 +595,27 @@
 	kfree(srq->swq);
 	if (rc)
 		return;
-	bnxt_qplib_free_hwq(res->pdev, &srq->hwq);
+	bnxt_qplib_free_hwq(res, &srq->hwq);
 }
 
 int bnxt_qplib_create_srq(struct bnxt_qplib_res *res,
 			  struct bnxt_qplib_srq *srq)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
-	struct cmdq_create_srq req;
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
 	struct creq_create_srq_resp resp;
+	struct cmdq_create_srq req;
 	struct bnxt_qplib_pbl *pbl;
 	u16 cmd_flags = 0;
+	u16 pg_sz_lvl;
 	int rc, idx;
 
-	srq->hwq.max_elements = srq->max_wqe;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &srq->hwq, &srq->sg_info,
-				       &srq->hwq.max_elements,
-				       BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_QUEUE);
+	hwq_attr.res = res;
+	hwq_attr.sginfo = &srq->sg_info;
+	hwq_attr.depth = srq->max_wqe;
+	hwq_attr.stride = srq->wqe_size;
+	hwq_attr.type = HWQ_TYPE_QUEUE;
+	rc = bnxt_qplib_alloc_init_hwq(&srq->hwq, &hwq_attr);
 	if (rc)
 		goto exit;
 
@@ -562,22 +634,11 @@
 
 	req.srq_size = cpu_to_le16((u16)srq->hwq.max_elements);
 	pbl = &srq->hwq.pbl[PBL_LVL_0];
-	req.pg_size_lvl = cpu_to_le16((((u16)srq->hwq.level &
-				      CMDQ_CREATE_SRQ_LVL_MASK) <<
-				      CMDQ_CREATE_SRQ_LVL_SFT) |
-				      (pbl->pg_size == ROCE_PG_SIZE_4K ?
-				       CMDQ_CREATE_SRQ_PG_SIZE_PG_4K :
-				       pbl->pg_size == ROCE_PG_SIZE_8K ?
-				       CMDQ_CREATE_SRQ_PG_SIZE_PG_8K :
-				       pbl->pg_size == ROCE_PG_SIZE_64K ?
-				       CMDQ_CREATE_SRQ_PG_SIZE_PG_64K :
-				       pbl->pg_size == ROCE_PG_SIZE_2M ?
-				       CMDQ_CREATE_SRQ_PG_SIZE_PG_2M :
-				       pbl->pg_size == ROCE_PG_SIZE_8M ?
-				       CMDQ_CREATE_SRQ_PG_SIZE_PG_8M :
-				       pbl->pg_size == ROCE_PG_SIZE_1G ?
-				       CMDQ_CREATE_SRQ_PG_SIZE_PG_1G :
-				       CMDQ_CREATE_SRQ_PG_SIZE_PG_4K));
+	pg_sz_lvl = ((u16)bnxt_qplib_base_pg_size(&srq->hwq) <<
+		     CMDQ_CREATE_SRQ_PG_SIZE_SFT);
+	pg_sz_lvl |= (srq->hwq.level & CMDQ_CREATE_SRQ_LVL_MASK) <<
+		      CMDQ_CREATE_SRQ_LVL_SFT;
+	req.pg_size_lvl = cpu_to_le16(pg_sz_lvl);
 	req.pbl = cpu_to_le64(pbl->pg_map_arr[0]);
 	req.pd_id = cpu_to_le32(srq->pd->id);
 	req.eventq_id = cpu_to_le16(srq->eventq_hw_ring_id);
@@ -595,14 +656,18 @@
 	srq->swq[srq->last_idx].next_idx = -1;
 
 	srq->id = le32_to_cpu(resp.xid);
-	srq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem;
+	srq->dbinfo.hwq = &srq->hwq;
+	srq->dbinfo.xid = srq->id;
+	srq->dbinfo.db = srq->dpi->dbr;
+	srq->dbinfo.max_slot = 1;
+	srq->dbinfo.priv_db = res->dpi_tbl.dbr_bar_reg_iomem;
 	if (srq->threshold)
-		bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARMENA);
+		bnxt_qplib_armen_db(&srq->dbinfo, DBC_DBC_TYPE_SRQ_ARMENA);
 	srq->arm_req = false;
 
 	return 0;
 fail:
-	bnxt_qplib_free_hwq(res->pdev, &srq->hwq);
+	bnxt_qplib_free_hwq(res, &srq->hwq);
 	kfree(srq->swq);
 exit:
 	return rc;
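Editor's note: throughout this series the nested ternary chains that translated ROCE_PG_SIZE_* into CMDQ_*_PG_SIZE_PG_* codes are replaced by bnxt_qplib_base_pg_size() shifted into place and OR'ed with the level bits. The sketch below models that pack-two-fields-into-one-word idea; the shift, mask and page-size codes here are illustrative, not the firmware ABI.

    #include <stdint.h>

    #define PG_SIZE_SFT 4                   /* illustrative field position */
    #define LVL_MASK    0x3u                /* illustrative level width */

    /* map a backing page size to a small code via a lookup */
    static uint16_t pg_code(uint32_t pg_size)
    {
            switch (pg_size) {
            case 4096:              return 0;
            case 8192:              return 1;
            case 65536:             return 2;
            case 2 * 1024 * 1024:   return 3;
            default:                return 0;       /* fall back to 4K */
            }
    }

    static uint16_t pack_pg_sz_lvl(uint32_t pg_size, uint16_t level)
    {
            return (uint16_t)((pg_code(pg_size) << PG_SIZE_SFT) |
                              (level & LVL_MASK));
    }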
@@ -621,7 +686,7 @@
 				    srq_hwq->max_elements - sw_cons + sw_prod;
 	if (count > srq->threshold) {
 		srq->arm_req = false;
-		bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARM);
+		bnxt_qplib_srq_arm_db(&srq->dbinfo, srq->threshold);
 	} else {
 		/* Deferred arming */
 		srq->arm_req = true;
@@ -642,12 +707,13 @@
 	int rc = 0;
 
 	RCFW_CMD_PREP(req, QUERY_SRQ, cmd_flags);
-	req.srq_cid = cpu_to_le32(srq->id);
 
 	/* Configure the request */
 	sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb));
 	if (!sbuf)
 		return -ENOMEM;
+	req.resp_size = sizeof(*sb) / BNXT_QPLIB_CMDQE_UNITS;
+	req.srq_cid = cpu_to_le32(srq->id);
 	sb = sbuf->sb;
 	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
 					  (void *)sbuf, 0);
@@ -661,7 +727,7 @@
 			     struct bnxt_qplib_swqe *wqe)
 {
 	struct bnxt_qplib_hwq *srq_hwq = &srq->hwq;
-	struct rq_wqe *srqe, **srqe_ptr;
+	struct rq_wqe *srqe;
 	struct sq_sge *hw_sge;
 	u32 sw_prod, sw_cons, count = 0;
 	int i, rc = 0, next;
@@ -679,9 +745,8 @@
 	spin_unlock(&srq_hwq->lock);
 
 	sw_prod = HWQ_CMP(srq_hwq->prod, srq_hwq);
-	srqe_ptr = (struct rq_wqe **)srq_hwq->pbl_ptr;
-	srqe = &srqe_ptr[RQE_PG(sw_prod)][RQE_IDX(sw_prod)];
-	memset(srqe, 0, BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
+	srqe = bnxt_qplib_get_qe(srq_hwq, sw_prod, NULL);
+	memset(srqe, 0, srq->wqe_size);
 	/* Calculate wqe_size16 and data_len */
 	for (i = 0, hw_sge = (struct sq_sge *)srqe->data;
 	     i < wqe->num_sge; i++, hw_sge++) {
@@ -709,27 +774,52 @@
 				    srq_hwq->max_elements - sw_cons + sw_prod;
 	spin_unlock(&srq_hwq->lock);
 	/* Ring DB */
-	bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ);
+	bnxt_qplib_ring_prod_db(&srq->dbinfo, DBC_DBC_TYPE_SRQ);
 	if (srq->arm_req == true && count > srq->threshold) {
 		srq->arm_req = false;
-		bnxt_qplib_arm_srq(srq, DBC_DBC_TYPE_SRQ_ARM);
+		bnxt_qplib_srq_arm_db(&srq->dbinfo, srq->threshold);
 	}
 done:
 	return rc;
 }
 
 /* QP */
+
+static int bnxt_qplib_alloc_init_swq(struct bnxt_qplib_q *que)
+{
+	int rc = 0;
+	int indx;
+
+	que->swq = kcalloc(que->max_wqe, sizeof(*que->swq), GFP_KERNEL);
+	if (!que->swq) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	que->swq_start = 0;
+	que->swq_last = que->max_wqe - 1;
+	for (indx = 0; indx < que->max_wqe; indx++)
+		que->swq[indx].next_idx = indx + 1;
+	que->swq[que->swq_last].next_idx = 0; /* Make it circular */
+	que->swq_last = 0;
+out:
+	return rc;
+}
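Editor's note: the helper above links the software queue entries into a circular chain through next_idx, so producer and consumer can follow the chain without modulo arithmetic on every step. A stand-alone sketch of building such a ring over an array (swq_ent and swq_create are illustrative names):

    #include <stdlib.h>

    struct swq_ent {
            int next_idx;                   /* index of the next entry */
            /* per-request bookkeeping would live here */
    };

    static struct swq_ent *swq_create(int nentries)
    {
            struct swq_ent *swq;
            int i;

            if (nentries <= 0)
                    return NULL;
            swq = calloc(nentries, sizeof(*swq));
            if (!swq)
                    return NULL;
            for (i = 0; i < nentries; i++)
                    swq[i].next_idx = i + 1;
            swq[nentries - 1].next_idx = 0; /* close the ring */
            return swq;
    }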
+
 int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 {
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
-	struct cmdq_create_qp1 req;
-	struct creq_create_qp1_resp resp;
-	struct bnxt_qplib_pbl *pbl;
 	struct bnxt_qplib_q *sq = &qp->sq;
 	struct bnxt_qplib_q *rq = &qp->rq;
-	int rc;
+	struct creq_create_qp1_resp resp;
+	struct cmdq_create_qp1 req;
+	struct bnxt_qplib_pbl *pbl;
 	u16 cmd_flags = 0;
 	u32 qp_flags = 0;
+	u8 pg_sz_lvl;
+	u32 tbl_indx;
+	int rc;
 
 	RCFW_CMD_PREP(req, CREATE_QP1, cmd_flags);
 
@@ -739,98 +829,65 @@
 	req.qp_handle = cpu_to_le64(qp->qp_handle);
 
 	/* SQ */
-	sq->hwq.max_elements = sq->max_wqe;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, NULL,
-				       &sq->hwq.max_elements,
-				       BNXT_QPLIB_MAX_SQE_ENTRY_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_QUEUE);
+	hwq_attr.res = res;
+	hwq_attr.sginfo = &sq->sg_info;
+	hwq_attr.stride = sizeof(struct sq_sge);
+	hwq_attr.depth = bnxt_qplib_get_depth(sq);
+	hwq_attr.type = HWQ_TYPE_QUEUE;
+	rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr);
 	if (rc)
 		goto exit;
 
-	sq->swq = kcalloc(sq->hwq.max_elements, sizeof(*sq->swq), GFP_KERNEL);
-	if (!sq->swq) {
-		rc = -ENOMEM;
+	rc = bnxt_qplib_alloc_init_swq(sq);
+	if (rc)
 		goto fail_sq;
-	}
+
+	req.sq_size = cpu_to_le32(bnxt_qplib_set_sq_size(sq, qp->wqe_mode));
 	pbl = &sq->hwq.pbl[PBL_LVL_0];
 	req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]);
-	req.sq_pg_size_sq_lvl =
-		((sq->hwq.level & CMDQ_CREATE_QP1_SQ_LVL_MASK)
-				<<  CMDQ_CREATE_QP1_SQ_LVL_SFT) |
-		(pbl->pg_size == ROCE_PG_SIZE_4K ?
-				CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_4K :
-		 pbl->pg_size == ROCE_PG_SIZE_8K ?
-				CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_8K :
-		 pbl->pg_size == ROCE_PG_SIZE_64K ?
-				CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_64K :
-		 pbl->pg_size == ROCE_PG_SIZE_2M ?
-				CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_2M :
-		 pbl->pg_size == ROCE_PG_SIZE_8M ?
-				CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_8M :
-		 pbl->pg_size == ROCE_PG_SIZE_1G ?
-				CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_1G :
-		 CMDQ_CREATE_QP1_SQ_PG_SIZE_PG_4K);
-
-	if (qp->scq)
-		req.scq_cid = cpu_to_le32(qp->scq->id);
-
-	qp_flags |= CMDQ_CREATE_QP1_QP_FLAGS_RESERVED_LKEY_ENABLE;
+	pg_sz_lvl = (bnxt_qplib_base_pg_size(&sq->hwq) <<
+		     CMDQ_CREATE_QP1_SQ_PG_SIZE_SFT);
+	pg_sz_lvl |= (sq->hwq.level & CMDQ_CREATE_QP1_SQ_LVL_MASK);
+	req.sq_pg_size_sq_lvl = pg_sz_lvl;
+	req.sq_fwo_sq_sge =
+		cpu_to_le16((sq->max_sge & CMDQ_CREATE_QP1_SQ_SGE_MASK) <<
+			     CMDQ_CREATE_QP1_SQ_SGE_SFT);
+	req.scq_cid = cpu_to_le32(qp->scq->id);
 
 	/* RQ */
 	if (rq->max_wqe) {
-		rq->hwq.max_elements = qp->rq.max_wqe;
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq, NULL,
-					       &rq->hwq.max_elements,
-					       BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0,
-					       PAGE_SIZE, HWQ_TYPE_QUEUE);
+		hwq_attr.res = res;
+		hwq_attr.sginfo = &rq->sg_info;
+		hwq_attr.stride = sizeof(struct sq_sge);
+		hwq_attr.depth = bnxt_qplib_get_depth(rq);
+		hwq_attr.type = HWQ_TYPE_QUEUE;
+		rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr);
 		if (rc)
-			goto fail_sq;
-
-		rq->swq = kcalloc(rq->hwq.max_elements, sizeof(*rq->swq),
-				  GFP_KERNEL);
-		if (!rq->swq) {
-			rc = -ENOMEM;
+			goto sq_swq;
+		rc = bnxt_qplib_alloc_init_swq(rq);
+		if (rc)
 			goto fail_rq;
-		}
+		req.rq_size = cpu_to_le32(rq->max_wqe);
 		pbl = &rq->hwq.pbl[PBL_LVL_0];
 		req.rq_pbl = cpu_to_le64(pbl->pg_map_arr[0]);
-		req.rq_pg_size_rq_lvl =
-			((rq->hwq.level & CMDQ_CREATE_QP1_RQ_LVL_MASK) <<
-			 CMDQ_CREATE_QP1_RQ_LVL_SFT) |
-				(pbl->pg_size == ROCE_PG_SIZE_4K ?
-					CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_4K :
-				 pbl->pg_size == ROCE_PG_SIZE_8K ?
-					CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_8K :
-				 pbl->pg_size == ROCE_PG_SIZE_64K ?
-					CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_64K :
-				 pbl->pg_size == ROCE_PG_SIZE_2M ?
-					CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_2M :
-				 pbl->pg_size == ROCE_PG_SIZE_8M ?
-					CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_8M :
-				 pbl->pg_size == ROCE_PG_SIZE_1G ?
-					CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_1G :
-				 CMDQ_CREATE_QP1_RQ_PG_SIZE_PG_4K);
-		if (qp->rcq)
-			req.rcq_cid = cpu_to_le32(qp->rcq->id);
+		pg_sz_lvl = (bnxt_qplib_base_pg_size(&rq->hwq) <<
+			     CMDQ_CREATE_QP1_RQ_PG_SIZE_SFT);
+		pg_sz_lvl |= (rq->hwq.level & CMDQ_CREATE_QP1_RQ_LVL_MASK);
+		req.rq_pg_size_rq_lvl = pg_sz_lvl;
+		req.rq_fwo_rq_sge =
+			cpu_to_le16((rq->max_sge &
+				     CMDQ_CREATE_QP1_RQ_SGE_MASK) <<
+				    CMDQ_CREATE_QP1_RQ_SGE_SFT);
 	}
-
+	req.rcq_cid = cpu_to_le32(qp->rcq->id);
 	/* Header buffer - allow hdr_buf pass in */
 	rc = bnxt_qplib_alloc_qp_hdr_buf(res, qp);
 	if (rc) {
 		rc = -ENOMEM;
-		goto fail;
+		goto rq_rwq;
 	}
+	qp_flags |= CMDQ_CREATE_QP1_QP_FLAGS_RESERVED_LKEY_ENABLE;
 	req.qp_flags = cpu_to_le32(qp_flags);
-	req.sq_size = cpu_to_le32(sq->hwq.max_elements);
-	req.rq_size = cpu_to_le32(rq->hwq.max_elements);
-
-	req.sq_fwo_sq_sge =
-		cpu_to_le16((sq->max_sge & CMDQ_CREATE_QP1_SQ_SGE_MASK) <<
-			    CMDQ_CREATE_QP1_SQ_SGE_SFT);
-	req.rq_fwo_rq_sge =
-		cpu_to_le16((rq->max_sge & CMDQ_CREATE_QP1_RQ_SGE_MASK) <<
-			    CMDQ_CREATE_QP1_RQ_SGE_SFT);
-
 	req.pd_id = cpu_to_le32(qp->pd->id);
 
 	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
@@ -840,39 +897,72 @@
 
 	qp->id = le32_to_cpu(resp.xid);
 	qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET;
-	rcfw->qp_tbl[qp->id].qp_id = qp->id;
-	rcfw->qp_tbl[qp->id].qp_handle = (void *)qp;
+	qp->cctx = res->cctx;
+	sq->dbinfo.hwq = &sq->hwq;
+	sq->dbinfo.xid = qp->id;
+	sq->dbinfo.db = qp->dpi->dbr;
+	sq->dbinfo.max_slot = bnxt_qplib_set_sq_max_slot(qp->wqe_mode);
+	if (rq->max_wqe) {
+		rq->dbinfo.hwq = &rq->hwq;
+		rq->dbinfo.xid = qp->id;
+		rq->dbinfo.db = qp->dpi->dbr;
+		rq->dbinfo.max_slot = bnxt_qplib_set_rq_max_slot(rq->wqe_size);
+	}
+	tbl_indx = map_qp_id_to_tbl_indx(qp->id, rcfw);
+	rcfw->qp_tbl[tbl_indx].qp_id = qp->id;
+	rcfw->qp_tbl[tbl_indx].qp_handle = (void *)qp;
 
 	return 0;
 
 fail:
 	bnxt_qplib_free_qp_hdr_buf(res, qp);
-fail_rq:
-	bnxt_qplib_free_hwq(res->pdev, &rq->hwq);
+rq_rwq:
 	kfree(rq->swq);
-fail_sq:
-	bnxt_qplib_free_hwq(res->pdev, &sq->hwq);
+fail_rq:
+	bnxt_qplib_free_hwq(res, &rq->hwq);
+sq_swq:
 	kfree(sq->swq);
+fail_sq:
+	bnxt_qplib_free_hwq(res, &sq->hwq);
 exit:
 	return rc;
 }
 
+static void bnxt_qplib_init_psn_ptr(struct bnxt_qplib_qp *qp, int size)
+{
+	struct bnxt_qplib_hwq *hwq;
+	struct bnxt_qplib_q *sq;
+	u64 fpsne, psn_pg;
+	u16 indx_pad = 0;
+
+	sq = &qp->sq;
+	hwq = &sq->hwq;
+	/* First psn entry */
+	fpsne = (u64)bnxt_qplib_get_qe(hwq, hwq->depth, &psn_pg);
+	if (!IS_ALIGNED(fpsne, PAGE_SIZE))
+		indx_pad = (fpsne & ~PAGE_MASK) / size;
+	hwq->pad_pgofft = indx_pad;
+	hwq->pad_pg = (u64 *)psn_pg;
+	hwq->pad_stride = size;
+}
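Editor's note: bnxt_qplib_init_psn_ptr() locates the first PSN search entry, which lives in the auxiliary area right after the WQE slots, and records how many entry-sized slots of padding precede it within its page when it is not page-aligned. The arithmetic reduces to the sketch below, assuming a 4 KiB page purely for illustration:

    #include <stdint.h>

    #define PG_SIZE 4096u                   /* assumed page size */

    /* how many entry-sized slots into its page does the first entry sit? */
    static uint32_t first_entry_offset(uintptr_t first_entry, uint32_t entry_sz)
    {
            uintptr_t in_page = first_entry & (PG_SIZE - 1);

            return in_page ? (uint32_t)(in_page / entry_sz) : 0;
    }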
+
 int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
-	unsigned long int psn_search, poff = 0;
-	struct sq_psn_search **psn_search_ptr;
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
+	struct bnxt_qplib_sg_info sginfo = {};
 	struct bnxt_qplib_q *sq = &qp->sq;
 	struct bnxt_qplib_q *rq = &qp->rq;
-	int i, rc, req_size, psn_sz = 0;
-	struct sq_send **hw_sq_send_ptr;
 	struct creq_create_qp_resp resp;
+	int rc, req_size, psn_sz = 0;
 	struct bnxt_qplib_hwq *xrrq;
-	u16 cmd_flags = 0, max_ssge;
-	struct cmdq_create_qp req;
 	struct bnxt_qplib_pbl *pbl;
+	struct cmdq_create_qp req;
+	u16 cmd_flags = 0;
 	u32 qp_flags = 0;
-	u16 max_rsge;
+	u8 pg_sz_lvl;
+	u32 tbl_indx;
+	u16 nsge;
 
 	RCFW_CMD_PREP(req, CREATE_QP, cmd_flags);
 
@@ -887,140 +977,81 @@
 			 sizeof(struct sq_psn_search_ext) :
 			 sizeof(struct sq_psn_search);
 	}
-	sq->hwq.max_elements = sq->max_wqe;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, &sq->sg_info,
-				       &sq->hwq.max_elements,
-				       BNXT_QPLIB_MAX_SQE_ENTRY_SIZE,
-				       psn_sz,
-				       PAGE_SIZE, HWQ_TYPE_QUEUE);
+
+	hwq_attr.res = res;
+	hwq_attr.sginfo = &sq->sg_info;
+	hwq_attr.stride = sizeof(struct sq_sge);
+	hwq_attr.depth = bnxt_qplib_get_depth(sq);
+	hwq_attr.aux_stride = psn_sz;
+	hwq_attr.aux_depth = bnxt_qplib_set_sq_size(sq, qp->wqe_mode);
+	hwq_attr.type = HWQ_TYPE_QUEUE;
+	rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr);
 	if (rc)
 		goto exit;
 
-	sq->swq = kcalloc(sq->hwq.max_elements, sizeof(*sq->swq), GFP_KERNEL);
-	if (!sq->swq) {
-		rc = -ENOMEM;
+	rc = bnxt_qplib_alloc_init_swq(sq);
+	if (rc)
 		goto fail_sq;
-	}
-	hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr;
-	if (psn_sz) {
-		psn_search_ptr = (struct sq_psn_search **)
-				  &hw_sq_send_ptr[get_sqe_pg
-					(sq->hwq.max_elements)];
-		psn_search = (unsigned long int)
-			      &hw_sq_send_ptr[get_sqe_pg(sq->hwq.max_elements)]
-			      [get_sqe_idx(sq->hwq.max_elements)];
-		if (psn_search & ~PAGE_MASK) {
-			/* If the psn_search does not start on a page boundary,
-			 * then calculate the offset
-			 */
-			poff = (psn_search & ~PAGE_MASK) /
-				BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE;
-		}
-		for (i = 0; i < sq->hwq.max_elements; i++) {
-			sq->swq[i].psn_search =
-				&psn_search_ptr[get_psne_pg(i + poff)]
-					       [get_psne_idx(i + poff)];
-			/*psns_ext will be used only for P5 chips. */
-			sq->swq[i].psn_ext =
-				(struct sq_psn_search_ext *)
-				&psn_search_ptr[get_psne_pg(i + poff)]
-					       [get_psne_idx(i + poff)];
-		}
-	}
+
+	if (psn_sz)
+		bnxt_qplib_init_psn_ptr(qp, psn_sz);
+
+	req.sq_size = cpu_to_le32(bnxt_qplib_set_sq_size(sq, qp->wqe_mode));
 	pbl = &sq->hwq.pbl[PBL_LVL_0];
 	req.sq_pbl = cpu_to_le64(pbl->pg_map_arr[0]);
-	req.sq_pg_size_sq_lvl =
-		((sq->hwq.level & CMDQ_CREATE_QP_SQ_LVL_MASK)
-				 <<  CMDQ_CREATE_QP_SQ_LVL_SFT) |
-		(pbl->pg_size == ROCE_PG_SIZE_4K ?
-				CMDQ_CREATE_QP_SQ_PG_SIZE_PG_4K :
-		 pbl->pg_size == ROCE_PG_SIZE_8K ?
-				CMDQ_CREATE_QP_SQ_PG_SIZE_PG_8K :
-		 pbl->pg_size == ROCE_PG_SIZE_64K ?
-				CMDQ_CREATE_QP_SQ_PG_SIZE_PG_64K :
-		 pbl->pg_size == ROCE_PG_SIZE_2M ?
-				CMDQ_CREATE_QP_SQ_PG_SIZE_PG_2M :
-		 pbl->pg_size == ROCE_PG_SIZE_8M ?
-				CMDQ_CREATE_QP_SQ_PG_SIZE_PG_8M :
-		 pbl->pg_size == ROCE_PG_SIZE_1G ?
-				CMDQ_CREATE_QP_SQ_PG_SIZE_PG_1G :
-		 CMDQ_CREATE_QP_SQ_PG_SIZE_PG_4K);
+	pg_sz_lvl = (bnxt_qplib_base_pg_size(&sq->hwq) <<
+		     CMDQ_CREATE_QP_SQ_PG_SIZE_SFT);
+	pg_sz_lvl |= (sq->hwq.level & CMDQ_CREATE_QP_SQ_LVL_MASK);
+	req.sq_pg_size_sq_lvl = pg_sz_lvl;
+	req.sq_fwo_sq_sge =
+		cpu_to_le16(((sq->max_sge & CMDQ_CREATE_QP_SQ_SGE_MASK) <<
+			     CMDQ_CREATE_QP_SQ_SGE_SFT) | 0);
+	req.scq_cid = cpu_to_le32(qp->scq->id);
 
-	if (qp->scq)
-		req.scq_cid = cpu_to_le32(qp->scq->id);
+	/* RQ */
+	if (!qp->srq) {
+		hwq_attr.res = res;
+		hwq_attr.sginfo = &rq->sg_info;
+		hwq_attr.stride = sizeof(struct sq_sge);
+		hwq_attr.depth = bnxt_qplib_get_depth(rq);
+		hwq_attr.aux_stride = 0;
+		hwq_attr.aux_depth = 0;
+		hwq_attr.type = HWQ_TYPE_QUEUE;
+		rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr);
+		if (rc)
+			goto sq_swq;
+		rc = bnxt_qplib_alloc_init_swq(rq);
+		if (rc)
+			goto fail_rq;
+
+		req.rq_size = cpu_to_le32(rq->max_wqe);
+		pbl = &rq->hwq.pbl[PBL_LVL_0];
+		req.rq_pbl = cpu_to_le64(pbl->pg_map_arr[0]);
+		pg_sz_lvl = (bnxt_qplib_base_pg_size(&rq->hwq) <<
+			     CMDQ_CREATE_QP_RQ_PG_SIZE_SFT);
+		pg_sz_lvl |= (rq->hwq.level & CMDQ_CREATE_QP_RQ_LVL_MASK);
+		req.rq_pg_size_rq_lvl = pg_sz_lvl;
+		nsge = (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ?
+			6 : rq->max_sge;
+		req.rq_fwo_rq_sge =
+			cpu_to_le16(((nsge &
+				      CMDQ_CREATE_QP_RQ_SGE_MASK) <<
+				     CMDQ_CREATE_QP_RQ_SGE_SFT) | 0);
+	} else {
+		/* SRQ */
+		qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_SRQ_USED;
+		req.srq_cid = cpu_to_le32(qp->srq->id);
+	}
+	req.rcq_cid = cpu_to_le32(qp->rcq->id);
 
 	qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_RESERVED_LKEY_ENABLE;
 	qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FR_PMR_ENABLED;
 	if (qp->sig_type)
 		qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_FORCE_COMPLETION;
-
-	/* RQ */
-	if (rq->max_wqe) {
-		rq->hwq.max_elements = rq->max_wqe;
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq,
-					       &rq->sg_info,
-					       &rq->hwq.max_elements,
-					       BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0,
-					       PAGE_SIZE, HWQ_TYPE_QUEUE);
-		if (rc)
-			goto fail_sq;
-
-		rq->swq = kcalloc(rq->hwq.max_elements, sizeof(*rq->swq),
-				  GFP_KERNEL);
-		if (!rq->swq) {
-			rc = -ENOMEM;
-			goto fail_rq;
-		}
-		pbl = &rq->hwq.pbl[PBL_LVL_0];
-		req.rq_pbl = cpu_to_le64(pbl->pg_map_arr[0]);
-		req.rq_pg_size_rq_lvl =
-			((rq->hwq.level & CMDQ_CREATE_QP_RQ_LVL_MASK) <<
-			 CMDQ_CREATE_QP_RQ_LVL_SFT) |
-				(pbl->pg_size == ROCE_PG_SIZE_4K ?
-					CMDQ_CREATE_QP_RQ_PG_SIZE_PG_4K :
-				 pbl->pg_size == ROCE_PG_SIZE_8K ?
-					CMDQ_CREATE_QP_RQ_PG_SIZE_PG_8K :
-				 pbl->pg_size == ROCE_PG_SIZE_64K ?
-					CMDQ_CREATE_QP_RQ_PG_SIZE_PG_64K :
-				 pbl->pg_size == ROCE_PG_SIZE_2M ?
-					CMDQ_CREATE_QP_RQ_PG_SIZE_PG_2M :
-				 pbl->pg_size == ROCE_PG_SIZE_8M ?
-					CMDQ_CREATE_QP_RQ_PG_SIZE_PG_8M :
-				 pbl->pg_size == ROCE_PG_SIZE_1G ?
-					CMDQ_CREATE_QP_RQ_PG_SIZE_PG_1G :
-				 CMDQ_CREATE_QP_RQ_PG_SIZE_PG_4K);
-	} else {
-		/* SRQ */
-		if (qp->srq) {
-			qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_SRQ_USED;
-			req.srq_cid = cpu_to_le32(qp->srq->id);
-		}
-	}
-
-	if (qp->rcq)
-		req.rcq_cid = cpu_to_le32(qp->rcq->id);
+	if (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE)
+		qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_VARIABLE_SIZED_WQE_ENABLED;
 	req.qp_flags = cpu_to_le32(qp_flags);
-	req.sq_size = cpu_to_le32(sq->hwq.max_elements);
-	req.rq_size = cpu_to_le32(rq->hwq.max_elements);
-	qp->sq_hdr_buf = NULL;
-	qp->rq_hdr_buf = NULL;
 
-	rc = bnxt_qplib_alloc_qp_hdr_buf(res, qp);
-	if (rc)
-		goto fail_rq;
-
-	/* CTRL-22434: Irrespective of the requested SGE count on the SQ
-	 * always create the QP with max send sges possible if the requested
-	 * inline size is greater than 0.
-	 */
-	max_ssge = qp->max_inline_data ? 6 : sq->max_sge;
-	req.sq_fwo_sq_sge = cpu_to_le16(
-				((max_ssge & CMDQ_CREATE_QP_SQ_SGE_MASK)
-				 << CMDQ_CREATE_QP_SQ_SGE_SFT) | 0);
-	max_rsge = bnxt_qplib_is_chip_gen_p5(res->cctx) ? 6 : rq->max_sge;
-	req.rq_fwo_rq_sge = cpu_to_le16(
-				((max_rsge & CMDQ_CREATE_QP_RQ_SGE_MASK)
-				 << CMDQ_CREATE_QP_RQ_SGE_SFT) | 0);
 	/* ORRQ and IRRQ */
 	if (psn_sz) {
 		xrrq = &qp->orrq;
@@ -1029,12 +1060,19 @@
 		req_size = xrrq->max_elements *
 			   BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE + PAGE_SIZE - 1;
 		req_size &= ~(PAGE_SIZE - 1);
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL,
-					       &xrrq->max_elements,
-					       BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE,
-					       0, req_size, HWQ_TYPE_CTX);
+		sginfo.pgsize = req_size;
+		sginfo.pgshft = PAGE_SHIFT;
+
+		hwq_attr.res = res;
+		hwq_attr.sginfo = &sginfo;
+		hwq_attr.depth = xrrq->max_elements;
+		hwq_attr.stride = BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE;
+		hwq_attr.aux_stride = 0;
+		hwq_attr.aux_depth = 0;
+		hwq_attr.type = HWQ_TYPE_CTX;
+		rc = bnxt_qplib_alloc_init_hwq(xrrq, &hwq_attr);
 		if (rc)
-			goto fail_buf_free;
+			goto rq_swq;
 		pbl = &xrrq->pbl[PBL_LVL_0];
 		req.orrq_addr = cpu_to_le64(pbl->pg_map_arr[0]);
 
@@ -1044,11 +1082,10 @@
 		req_size = xrrq->max_elements *
 			   BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE + PAGE_SIZE - 1;
 		req_size &= ~(PAGE_SIZE - 1);
-
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL,
-					       &xrrq->max_elements,
-					       BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE,
-					       0, req_size, HWQ_TYPE_CTX);
+		sginfo.pgsize = req_size;
+		hwq_attr.depth =  xrrq->max_elements;
+		hwq_attr.stride = BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE;
+		rc = bnxt_qplib_alloc_init_hwq(xrrq, &hwq_attr);
 		if (rc)
 			goto fail_orrq;
 
@@ -1064,28 +1101,36 @@
 
 	qp->id = le32_to_cpu(resp.xid);
 	qp->cur_qp_state = CMDQ_MODIFY_QP_NEW_STATE_RESET;
-	qp->cctx = res->cctx;
 	INIT_LIST_HEAD(&qp->sq_flush);
 	INIT_LIST_HEAD(&qp->rq_flush);
-	rcfw->qp_tbl[qp->id].qp_id = qp->id;
-	rcfw->qp_tbl[qp->id].qp_handle = (void *)qp;
+	qp->cctx = res->cctx;
+	sq->dbinfo.hwq = &sq->hwq;
+	sq->dbinfo.xid = qp->id;
+	sq->dbinfo.db = qp->dpi->dbr;
+	sq->dbinfo.max_slot = bnxt_qplib_set_sq_max_slot(qp->wqe_mode);
+	if (rq->max_wqe) {
+		rq->dbinfo.hwq = &rq->hwq;
+		rq->dbinfo.xid = qp->id;
+		rq->dbinfo.db = qp->dpi->dbr;
+		rq->dbinfo.max_slot = bnxt_qplib_set_rq_max_slot(rq->wqe_size);
+	}
+	tbl_indx = map_qp_id_to_tbl_indx(qp->id, rcfw);
+	rcfw->qp_tbl[tbl_indx].qp_id = qp->id;
+	rcfw->qp_tbl[tbl_indx].qp_handle = (void *)qp;
 
 	return 0;
-
 fail:
-	if (qp->irrq.max_elements)
-		bnxt_qplib_free_hwq(res->pdev, &qp->irrq);
+	bnxt_qplib_free_hwq(res, &qp->irrq);
 fail_orrq:
-	if (qp->orrq.max_elements)
-		bnxt_qplib_free_hwq(res->pdev, &qp->orrq);
-fail_buf_free:
-	bnxt_qplib_free_qp_hdr_buf(res, qp);
-fail_rq:
-	bnxt_qplib_free_hwq(res->pdev, &rq->hwq);
+	bnxt_qplib_free_hwq(res, &qp->orrq);
+rq_swq:
 	kfree(rq->swq);
-fail_sq:
-	bnxt_qplib_free_hwq(res->pdev, &sq->hwq);
+fail_rq:
+	bnxt_qplib_free_hwq(res, &rq->hwq);
+sq_swq:
 	kfree(sq->swq);
+fail_sq:
+	bnxt_qplib_free_hwq(res, &sq->hwq);
 exit:
 	return rc;
 }
@@ -1371,12 +1416,11 @@
 static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp)
 {
 	struct bnxt_qplib_hwq *cq_hwq = &cq->hwq;
-	struct cq_base *hw_cqe, **hw_cqe_ptr;
+	struct cq_base *hw_cqe;
 	int i;
 
 	for (i = 0; i < cq_hwq->max_elements; i++) {
-		hw_cqe_ptr = (struct cq_base **)cq_hwq->pbl_ptr;
-		hw_cqe = &hw_cqe_ptr[CQE_PG(i)][CQE_IDX(i)];
+		hw_cqe = bnxt_qplib_get_qe(cq_hwq, i, NULL);
 		if (!CQE_CMP_VALID(hw_cqe, i, cq_hwq->max_elements))
 			continue;
 		/*
@@ -1417,10 +1461,12 @@
 	struct cmdq_destroy_qp req;
 	struct creq_destroy_qp_resp resp;
 	u16 cmd_flags = 0;
+	u32 tbl_indx;
 	int rc;
 
-	rcfw->qp_tbl[qp->id].qp_id = BNXT_QPLIB_QP_ID_INVALID;
-	rcfw->qp_tbl[qp->id].qp_handle = NULL;
+	tbl_indx = map_qp_id_to_tbl_indx(qp->id, rcfw);
+	rcfw->qp_tbl[tbl_indx].qp_id = BNXT_QPLIB_QP_ID_INVALID;
+	rcfw->qp_tbl[tbl_indx].qp_handle = NULL;
 
 	RCFW_CMD_PREP(req, DESTROY_QP, cmd_flags);
 
@@ -1428,8 +1474,8 @@
 	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
 					  (void *)&resp, NULL, 0);
 	if (rc) {
-		rcfw->qp_tbl[qp->id].qp_id = qp->id;
-		rcfw->qp_tbl[qp->id].qp_handle = qp;
+		rcfw->qp_tbl[tbl_indx].qp_id = qp->id;
+		rcfw->qp_tbl[tbl_indx].qp_handle = qp;
 		return rc;
 	}
 
@@ -1440,16 +1486,16 @@
 			    struct bnxt_qplib_qp *qp)
 {
 	bnxt_qplib_free_qp_hdr_buf(res, qp);
-	bnxt_qplib_free_hwq(res->pdev, &qp->sq.hwq);
+	bnxt_qplib_free_hwq(res, &qp->sq.hwq);
 	kfree(qp->sq.swq);
 
-	bnxt_qplib_free_hwq(res->pdev, &qp->rq.hwq);
+	bnxt_qplib_free_hwq(res, &qp->rq.hwq);
 	kfree(qp->rq.swq);
 
 	if (qp->irrq.max_elements)
-		bnxt_qplib_free_hwq(res->pdev, &qp->irrq);
+		bnxt_qplib_free_hwq(res, &qp->irrq);
 	if (qp->orrq.max_elements)
-		bnxt_qplib_free_hwq(res->pdev, &qp->orrq);
+		bnxt_qplib_free_hwq(res, &qp->orrq);
 
 }
 
@@ -1462,7 +1508,7 @@
 	memset(sge, 0, sizeof(*sge));
 
 	if (qp->sq_hdr_buf) {
-		sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
+		sw_prod = sq->swq_start;
 		sge->addr = (dma_addr_t)(qp->sq_hdr_buf_map +
 					 sw_prod * qp->sq_hdr_buf_size);
 		sge->lkey = 0xFFFFFFFF;
@@ -1476,7 +1522,7 @@
 {
 	struct bnxt_qplib_q *rq = &qp->rq;
 
-	return HWQ_CMP(rq->hwq.prod, &rq->hwq);
+	return rq->swq_start;
 }
 
 dma_addr_t bnxt_qplib_get_qp_buf_from_index(struct bnxt_qplib_qp *qp, u32 index)
@@ -1493,7 +1539,7 @@
 	memset(sge, 0, sizeof(*sge));
 
 	if (qp->rq_hdr_buf) {
-		sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
+		sw_prod = rq->swq_start;
 		sge->addr = (dma_addr_t)(qp->rq_hdr_buf_map +
 					 sw_prod * qp->rq_hdr_buf_size);
 		sge->lkey = 0xFFFFFFFF;
@@ -1503,142 +1549,264 @@
 	return NULL;
 }
 
+static void bnxt_qplib_fill_psn_search(struct bnxt_qplib_qp *qp,
+				       struct bnxt_qplib_swqe *wqe,
+				       struct bnxt_qplib_swq *swq)
+{
+	struct sq_psn_search_ext *psns_ext;
+	struct sq_psn_search *psns;
+	u32 flg_npsn;
+	u32 op_spsn;
+
+	if (!swq->psn_search)
+		return;
+	psns = swq->psn_search;
+	psns_ext = swq->psn_ext;
+
+	op_spsn = ((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) &
+		    SQ_PSN_SEARCH_START_PSN_MASK);
+	op_spsn |= ((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) &
+		     SQ_PSN_SEARCH_OPCODE_MASK);
+	flg_npsn = ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) &
+		     SQ_PSN_SEARCH_NEXT_PSN_MASK);
+
+	if (bnxt_qplib_is_chip_gen_p5(qp->cctx)) {
+		psns_ext->opcode_start_psn = cpu_to_le32(op_spsn);
+		psns_ext->flags_next_psn = cpu_to_le32(flg_npsn);
+		psns_ext->start_slot_idx = cpu_to_le16(swq->slot_idx);
+	} else {
+		psns->opcode_start_psn = cpu_to_le32(op_spsn);
+		psns->flags_next_psn = cpu_to_le32(flg_npsn);
+	}
+}
+
+static int bnxt_qplib_put_inline(struct bnxt_qplib_qp *qp,
+				 struct bnxt_qplib_swqe *wqe,
+				 u16 *idx)
+{
+	struct bnxt_qplib_hwq *hwq;
+	int len, t_len, offt;
+	bool pull_dst = true;
+	void *il_dst = NULL;
+	void *il_src = NULL;
+	int t_cplen, cplen;
+	int indx;
+
+	hwq = &qp->sq.hwq;
+	t_len = 0;
+	for (indx = 0; indx < wqe->num_sge; indx++) {
+		len = wqe->sg_list[indx].size;
+		il_src = (void *)wqe->sg_list[indx].addr;
+		t_len += len;
+		if (t_len > qp->max_inline_data)
+			goto bad;
+		while (len) {
+			if (pull_dst) {
+				pull_dst = false;
+				il_dst = bnxt_qplib_get_prod_qe(hwq, *idx);
+				(*idx)++;
+				t_cplen = 0;
+				offt = 0;
+			}
+			cplen = min_t(int, len, sizeof(struct sq_sge));
+			cplen = min_t(int, cplen,
+					(sizeof(struct sq_sge) - offt));
+			memcpy(il_dst, il_src, cplen);
+			t_cplen += cplen;
+			il_src += cplen;
+			il_dst += cplen;
+			offt += cplen;
+			len -= cplen;
+			if (t_cplen == sizeof(struct sq_sge))
+				pull_dst = true;
+		}
+	}
+
+	return t_len;
+bad:
+	return -ENOMEM;
+}
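Editor's note: the inline path above streams the caller's scatter list into consecutive 16-byte queue slots, pulling a fresh slot from the producer side whenever the current one fills, and fails once the total exceeds max_inline_data. A stand-alone sketch of that chunked copy follows; get_slot() is a hypothetical stand-in for bnxt_qplib_get_prod_qe() and the size-limit check is omitted.

    #include <string.h>
    #include <stdint.h>

    #define SLOT_SZ 16

    extern void *get_slot(uint16_t *idx);   /* hypothetical slot accessor */

    static int copy_inline(const void *src, int len, uint16_t *idx)
    {
            const uint8_t *s = src;
            uint8_t *dst = NULL;
            int filled = SLOT_SZ;           /* force a pull for the first byte */
            int copied = 0;

            while (len) {
                    int chunk;

                    if (filled == SLOT_SZ) {
                            dst = get_slot(idx);    /* next 16-byte slot */
                            (*idx)++;
                            filled = 0;
                    }
                    chunk = len < SLOT_SZ - filled ? len : SLOT_SZ - filled;
                    memcpy(dst + filled, s, chunk);
                    filled += chunk;
                    s += chunk;
                    len -= chunk;
                    copied += chunk;
            }
            return copied;
    }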
+
+static u32 bnxt_qplib_put_sges(struct bnxt_qplib_hwq *hwq,
+			       struct bnxt_qplib_sge *ssge,
+			       u16 nsge, u16 *idx)
+{
+	struct sq_sge *dsge;
+	int indx, len = 0;
+
+	for (indx = 0; indx < nsge; indx++, (*idx)++) {
+		dsge = bnxt_qplib_get_prod_qe(hwq, *idx);
+		dsge->va_or_pa = cpu_to_le64(ssge[indx].addr);
+		dsge->l_key = cpu_to_le32(ssge[indx].lkey);
+		dsge->size = cpu_to_le32(ssge[indx].size);
+		len += ssge[indx].size;
+	}
+
+	return len;
+}
+
+static u16 bnxt_qplib_required_slots(struct bnxt_qplib_qp *qp,
+				     struct bnxt_qplib_swqe *wqe,
+				     u16 *wqe_sz, u16 *qdf, u8 mode)
+{
+	u32 ilsize, bytes;
+	u16 nsge;
+	u16 slot;
+
+	nsge = wqe->num_sge;
+	/* sq_send_hdr is SQ-named, but the RQ header has the same size. */
+	bytes = sizeof(struct sq_send_hdr) + nsge * sizeof(struct sq_sge);
+	if (wqe->flags & BNXT_QPLIB_SWQE_FLAGS_INLINE) {
+		ilsize = bnxt_qplib_calc_ilsize(wqe, qp->max_inline_data);
+		bytes = ALIGN(ilsize, sizeof(struct sq_sge));
+		bytes += sizeof(struct sq_send_hdr);
+	}
+
+	*qdf =  __xlate_qfd(qp->sq.q_full_delta, bytes);
+	slot = bytes >> 4;
+	*wqe_sz = slot;
+	if (mode == BNXT_QPLIB_WQE_MODE_STATIC)
+		slot = 8;
+	return slot;
+}
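Editor's note: bnxt_qplib_required_slots() measures a WQE in 16-byte slots — the send header plus one slot per SGE (or the aligned inline size) — and then forces the full 128-byte stride in the fixed-size WQE mode. The sketch below redoes that arithmetic with an assumed 32-byte header purely for illustration; the real header size and the q_full_delta translation (__xlate_qfd) are driver-specific.

    #include <stdint.h>

    #define SLOT_SZ      16u
    #define HDR_BYTES    32u                /* assumed header size */
    #define STATIC_SLOTS 8u                 /* 128-byte fixed-size WQE */

    static uint16_t wqe_slots(uint16_t nsge, int variable_mode)
    {
            uint32_t bytes = HDR_BYTES + (uint32_t)nsge * SLOT_SZ;
            uint16_t slots = (uint16_t)(bytes / SLOT_SZ);   /* bytes >> 4 */

            return variable_mode ? slots : STATIC_SLOTS;
    }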
+
+static void bnxt_qplib_pull_psn_buff(struct bnxt_qplib_q *sq,
+				     struct bnxt_qplib_swq *swq)
+{
+	struct bnxt_qplib_hwq *hwq;
+	u32 pg_num, pg_indx;
+	void *buff;
+	u32 tail;
+
+	hwq = &sq->hwq;
+	if (!hwq->pad_pg)
+		return;
+	tail = swq->slot_idx / sq->dbinfo.max_slot;
+	pg_num = (tail + hwq->pad_pgofft) / (PAGE_SIZE / hwq->pad_stride);
+	pg_indx = (tail + hwq->pad_pgofft) % (PAGE_SIZE / hwq->pad_stride);
+	buff = (void *)(hwq->pad_pg[pg_num] + pg_indx * hwq->pad_stride);
+	swq->psn_ext = buff;
+	swq->psn_search = buff;
+}
+
 void bnxt_qplib_post_send_db(struct bnxt_qplib_qp *qp)
 {
 	struct bnxt_qplib_q *sq = &qp->sq;
-	u32 sw_prod;
-	u64 val = 0;
 
-	val = (((qp->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
-	       DBC_DBC_TYPE_SQ);
-	val <<= 32;
-	sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
-	val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
-	/* Flush all the WQE writes to HW */
-	writeq(val, qp->dpi->dbr);
+	bnxt_qplib_ring_prod_db(&sq->dbinfo, DBC_DBC_TYPE_SQ);
 }
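Editor's note: the open-coded writeq() removed here shows the doorbell layout that the new bnxt_qplib_ring_prod_db() helper wraps — queue id and operation type in the upper 32 bits, producer index in the lower. A sketch of composing such a value is below; the masks and field widths are illustrative, not the DBC register definition.

    #include <stdint.h>

    #define DB_XID_MASK   0xfffffULL        /* illustrative widths */
    #define DB_INDEX_MASK 0xffffffULL

    static uint64_t make_db(uint32_t xid, uint32_t type, uint32_t index)
    {
            uint64_t hi = ((uint64_t)xid & DB_XID_MASK) | type;

            /* queue id + type in the high word, producer index in the low */
            return (hi << 32) | ((uint64_t)index & DB_INDEX_MASK);
    }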
 
 int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp,
 			 struct bnxt_qplib_swqe *wqe)
 {
-	struct bnxt_qplib_q *sq = &qp->sq;
-	struct bnxt_qplib_swq *swq;
-	struct sq_send *hw_sq_send_hdr, **hw_sq_send_ptr;
-	struct sq_sge *hw_sge;
 	struct bnxt_qplib_nq_work *nq_work = NULL;
-	bool sch_handler = false;
-	u32 sw_prod;
-	u8 wqe_size16;
 	int i, rc = 0, data_len = 0, pkt_num = 0;
+	struct bnxt_qplib_q *sq = &qp->sq;
+	struct bnxt_qplib_hwq *hwq;
+	struct bnxt_qplib_swq *swq;
+	bool sch_handler = false;
+	u16 wqe_sz, qdf = 0;
+	void *base_hdr;
+	void *ext_hdr;
 	__le32 temp32;
+	u32 wqe_idx;
+	u32 slots;
+	u16 idx;
 
-	if (qp->state != CMDQ_MODIFY_QP_NEW_STATE_RTS) {
-		if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
-			sch_handler = true;
-			dev_dbg(&sq->hwq.pdev->dev,
-				"%s Error QP. Scheduling for poll_cq\n",
-				__func__);
-			goto queue_err;
-		}
+	hwq = &sq->hwq;
+	if (qp->state != CMDQ_MODIFY_QP_NEW_STATE_RTS &&
+	    qp->state != CMDQ_MODIFY_QP_NEW_STATE_ERR) {
+		dev_err(&hwq->pdev->dev,
+			"QPLIB: FP: QP (0x%x) is in the 0x%x state",
+			qp->id, qp->state);
+		rc = -EINVAL;
+		goto done;
 	}
 
-	if (bnxt_qplib_queue_full(sq)) {
-		dev_err(&sq->hwq.pdev->dev,
+	slots = bnxt_qplib_required_slots(qp, wqe, &wqe_sz, &qdf, qp->wqe_mode);
+	if (bnxt_qplib_queue_full(sq, slots + qdf)) {
+		dev_err(&hwq->pdev->dev,
 			"prod = %#x cons = %#x qdepth = %#x delta = %#x\n",
-			sq->hwq.prod, sq->hwq.cons, sq->hwq.max_elements,
-			sq->q_full_delta);
+			hwq->prod, hwq->cons, hwq->depth, sq->q_full_delta);
 		rc = -ENOMEM;
 		goto done;
 	}
-	sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
-	swq = &sq->swq[sw_prod];
+
+	swq = bnxt_qplib_get_swqe(sq, &wqe_idx);
+	bnxt_qplib_pull_psn_buff(sq, swq);
+
+	idx = 0;
+	swq->slot_idx = hwq->prod;
+	swq->slots = slots;
 	swq->wr_id = wqe->wr_id;
 	swq->type = wqe->type;
 	swq->flags = wqe->flags;
+	swq->start_psn = sq->psn & BTH_PSN_MASK;
 	if (qp->sig_type)
 		swq->flags |= SQ_SEND_FLAGS_SIGNAL_COMP;
-	swq->start_psn = sq->psn & BTH_PSN_MASK;
 
-	hw_sq_send_ptr = (struct sq_send **)sq->hwq.pbl_ptr;
-	hw_sq_send_hdr = &hw_sq_send_ptr[get_sqe_pg(sw_prod)]
-					[get_sqe_idx(sw_prod)];
-
-	memset(hw_sq_send_hdr, 0, BNXT_QPLIB_MAX_SQE_ENTRY_SIZE);
-
-	if (wqe->flags & BNXT_QPLIB_SWQE_FLAGS_INLINE) {
-		/* Copy the inline data */
-		if (wqe->inline_len > BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH) {
-			dev_warn(&sq->hwq.pdev->dev,
-				 "Inline data length > 96 detected\n");
-			data_len = BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH;
-		} else {
-			data_len = wqe->inline_len;
-		}
-		memcpy(hw_sq_send_hdr->data, wqe->inline_data, data_len);
-		wqe_size16 = (data_len + 15) >> 4;
-	} else {
-		for (i = 0, hw_sge = (struct sq_sge *)hw_sq_send_hdr->data;
-		     i < wqe->num_sge; i++, hw_sge++) {
-			hw_sge->va_or_pa = cpu_to_le64(wqe->sg_list[i].addr);
-			hw_sge->l_key = cpu_to_le32(wqe->sg_list[i].lkey);
-			hw_sge->size = cpu_to_le32(wqe->sg_list[i].size);
-			data_len += wqe->sg_list[i].size;
-		}
-		/* Each SGE entry = 1 WQE size16 */
-		wqe_size16 = wqe->num_sge;
-		/* HW requires wqe size has room for atleast one SGE even if
-		 * none was supplied by ULP
-		 */
-		if (!wqe->num_sge)
-			wqe_size16++;
+	if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
+		sch_handler = true;
+		dev_dbg(&hwq->pdev->dev,
+			"%s Error QP. Scheduling for poll_cq\n", __func__);
+		goto queue_err;
 	}
 
+	base_hdr = bnxt_qplib_get_prod_qe(hwq, idx++);
+	ext_hdr = bnxt_qplib_get_prod_qe(hwq, idx++);
+	memset(base_hdr, 0, sizeof(struct sq_sge));
+	memset(ext_hdr, 0, sizeof(struct sq_sge));
+
+	if (wqe->flags & BNXT_QPLIB_SWQE_FLAGS_INLINE)
+		/* Copy the inline data */
+		data_len = bnxt_qplib_put_inline(qp, wqe, &idx);
+	else
+		data_len = bnxt_qplib_put_sges(hwq, wqe->sg_list, wqe->num_sge,
+					       &idx);
+	if (data_len < 0)
+		goto queue_err;
 	/* Specifics */
 	switch (wqe->type) {
 	case BNXT_QPLIB_SWQE_TYPE_SEND:
 		if (qp->type == CMDQ_CREATE_QP1_TYPE_GSI) {
+			struct sq_send_raweth_qp1_hdr *sqe = base_hdr;
+			struct sq_raw_ext_hdr *ext_sqe = ext_hdr;
 			/* Assemble info for Raw Ethertype QPs */
-			struct sq_send_raweth_qp1 *sqe =
-				(struct sq_send_raweth_qp1 *)hw_sq_send_hdr;
 
 			sqe->wqe_type = wqe->type;
 			sqe->flags = wqe->flags;
-			sqe->wqe_size = wqe_size16 +
-				((offsetof(typeof(*sqe), data) + 15) >> 4);
+			sqe->wqe_size = wqe_sz;
 			sqe->cfa_action = cpu_to_le16(wqe->rawqp1.cfa_action);
 			sqe->lflags = cpu_to_le16(wqe->rawqp1.lflags);
 			sqe->length = cpu_to_le32(data_len);
-			sqe->cfa_meta = cpu_to_le32((wqe->rawqp1.cfa_meta &
+			ext_sqe->cfa_meta = cpu_to_le32((wqe->rawqp1.cfa_meta &
 				SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_MASK) <<
 				SQ_SEND_RAWETH_QP1_CFA_META_VLAN_VID_SFT);
 
 			break;
 		}
-		/* fall thru */
+		fallthrough;
 	case BNXT_QPLIB_SWQE_TYPE_SEND_WITH_IMM:
 	case BNXT_QPLIB_SWQE_TYPE_SEND_WITH_INV:
 	{
-		struct sq_send *sqe = (struct sq_send *)hw_sq_send_hdr;
+		struct sq_ud_ext_hdr *ext_sqe = ext_hdr;
+		struct sq_send_hdr *sqe = base_hdr;
 
 		sqe->wqe_type = wqe->type;
 		sqe->flags = wqe->flags;
-		sqe->wqe_size = wqe_size16 +
-				((offsetof(typeof(*sqe), data) + 15) >> 4);
-		sqe->inv_key_or_imm_data = cpu_to_le32(
-						wqe->send.inv_key);
+		sqe->wqe_size = wqe_sz;
+		sqe->inv_key_or_imm_data = cpu_to_le32(wqe->send.inv_key);
 		if (qp->type == CMDQ_CREATE_QP_TYPE_UD ||
 		    qp->type == CMDQ_CREATE_QP_TYPE_GSI) {
 			sqe->q_key = cpu_to_le32(wqe->send.q_key);
-			sqe->dst_qp = cpu_to_le32(
-					wqe->send.dst_qp & SQ_SEND_DST_QP_MASK);
 			sqe->length = cpu_to_le32(data_len);
-			sqe->avid = cpu_to_le32(wqe->send.avid &
-						SQ_SEND_AVID_MASK);
 			sq->psn = (sq->psn + 1) & BTH_PSN_MASK;
+			ext_sqe->dst_qp = cpu_to_le32(wqe->send.dst_qp &
+						      SQ_SEND_DST_QP_MASK);
+			ext_sqe->avid = cpu_to_le32(wqe->send.avid &
+						    SQ_SEND_AVID_MASK);
 		} else {
 			sqe->length = cpu_to_le32(data_len);
-			sqe->dst_qp = 0;
-			sqe->avid = 0;
 			if (qp->mtu)
 				pkt_num = (data_len + qp->mtu - 1) / qp->mtu;
 			if (!pkt_num)
@@ -1651,16 +1819,16 @@
 	case BNXT_QPLIB_SWQE_TYPE_RDMA_WRITE_WITH_IMM:
 	case BNXT_QPLIB_SWQE_TYPE_RDMA_READ:
 	{
-		struct sq_rdma *sqe = (struct sq_rdma *)hw_sq_send_hdr;
+		struct sq_rdma_ext_hdr *ext_sqe = ext_hdr;
+		struct sq_rdma_hdr *sqe = base_hdr;
 
 		sqe->wqe_type = wqe->type;
 		sqe->flags = wqe->flags;
-		sqe->wqe_size = wqe_size16 +
-				((offsetof(typeof(*sqe), data) + 15) >> 4);
+		sqe->wqe_size = wqe_sz;
 		sqe->imm_data = cpu_to_le32(wqe->rdma.inv_key);
 		sqe->length = cpu_to_le32((u32)data_len);
-		sqe->remote_va = cpu_to_le64(wqe->rdma.remote_va);
-		sqe->remote_key = cpu_to_le32(wqe->rdma.r_key);
+		ext_sqe->remote_va = cpu_to_le64(wqe->rdma.remote_va);
+		ext_sqe->remote_key = cpu_to_le32(wqe->rdma.r_key);
 		if (qp->mtu)
 			pkt_num = (data_len + qp->mtu - 1) / qp->mtu;
 		if (!pkt_num)
@@ -1671,14 +1839,15 @@
 	case BNXT_QPLIB_SWQE_TYPE_ATOMIC_CMP_AND_SWP:
 	case BNXT_QPLIB_SWQE_TYPE_ATOMIC_FETCH_AND_ADD:
 	{
-		struct sq_atomic *sqe = (struct sq_atomic *)hw_sq_send_hdr;
+		struct sq_atomic_ext_hdr *ext_sqe = ext_hdr;
+		struct sq_atomic_hdr *sqe = base_hdr;
 
 		sqe->wqe_type = wqe->type;
 		sqe->flags = wqe->flags;
 		sqe->remote_key = cpu_to_le32(wqe->atomic.r_key);
 		sqe->remote_va = cpu_to_le64(wqe->atomic.remote_va);
-		sqe->swap_data = cpu_to_le64(wqe->atomic.swap_data);
-		sqe->cmp_data = cpu_to_le64(wqe->atomic.cmp_data);
+		ext_sqe->swap_data = cpu_to_le64(wqe->atomic.swap_data);
+		ext_sqe->cmp_data = cpu_to_le64(wqe->atomic.cmp_data);
 		if (qp->mtu)
 			pkt_num = (data_len + qp->mtu - 1) / qp->mtu;
 		if (!pkt_num)
@@ -1688,8 +1857,7 @@
 	}
 	case BNXT_QPLIB_SWQE_TYPE_LOCAL_INV:
 	{
-		struct sq_localinvalidate *sqe =
-				(struct sq_localinvalidate *)hw_sq_send_hdr;
+		struct sq_localinvalidate *sqe = base_hdr;
 
 		sqe->wqe_type = wqe->type;
 		sqe->flags = wqe->flags;
@@ -1699,7 +1867,8 @@
 	}
 	case BNXT_QPLIB_SWQE_TYPE_FAST_REG_MR:
 	{
-		struct sq_fr_pmr *sqe = (struct sq_fr_pmr *)hw_sq_send_hdr;
+		struct sq_fr_pmr_ext_hdr *ext_sqe = ext_hdr;
+		struct sq_fr_pmr_hdr *sqe = base_hdr;
 
 		sqe->wqe_type = wqe->type;
 		sqe->flags = wqe->flags;
@@ -1723,14 +1892,15 @@
 			wqe->frmr.pbl_ptr[i] = cpu_to_le64(
 						wqe->frmr.page_list[i] |
 						PTU_PTE_VALID);
-		sqe->pblptr = cpu_to_le64(wqe->frmr.pbl_dma_ptr);
-		sqe->va = cpu_to_le64(wqe->frmr.va);
+		ext_sqe->pblptr = cpu_to_le64(wqe->frmr.pbl_dma_ptr);
+		ext_sqe->va = cpu_to_le64(wqe->frmr.va);
 
 		break;
 	}
 	case BNXT_QPLIB_SWQE_TYPE_BIND_MW:
 	{
-		struct sq_bind *sqe = (struct sq_bind *)hw_sq_send_hdr;
+		struct sq_bind_ext_hdr *ext_sqe = ext_hdr;
+		struct sq_bind_hdr *sqe = base_hdr;
 
 		sqe->wqe_type = wqe->type;
 		sqe->flags = wqe->flags;
@@ -1739,9 +1909,8 @@
 			(wqe->bind.zero_based ? SQ_BIND_ZERO_BASED : 0);
 		sqe->parent_l_key = cpu_to_le32(wqe->bind.parent_l_key);
 		sqe->l_key = cpu_to_le32(wqe->bind.r_key);
-		sqe->va = cpu_to_le64(wqe->bind.va);
-		temp32 = cpu_to_le32(wqe->bind.length);
-		memcpy(&sqe->length, &temp32, sizeof(wqe->bind.length));
+		ext_sqe->va = cpu_to_le64(wqe->bind.va);
+		ext_sqe->length_lo = cpu_to_le32(wqe->bind.length);
 		break;
 	}
 	default:
@@ -1750,43 +1919,11 @@
 		goto done;
 	}
 	swq->next_psn = sq->psn & BTH_PSN_MASK;
-	if (swq->psn_search) {
-		u32 opcd_spsn;
-		u32 flg_npsn;
-
-		opcd_spsn = ((swq->start_psn << SQ_PSN_SEARCH_START_PSN_SFT) &
-			      SQ_PSN_SEARCH_START_PSN_MASK);
-		opcd_spsn |= ((wqe->type << SQ_PSN_SEARCH_OPCODE_SFT) &
-			       SQ_PSN_SEARCH_OPCODE_MASK);
-		flg_npsn = ((swq->next_psn << SQ_PSN_SEARCH_NEXT_PSN_SFT) &
-			     SQ_PSN_SEARCH_NEXT_PSN_MASK);
-		if (bnxt_qplib_is_chip_gen_p5(qp->cctx)) {
-			swq->psn_ext->opcode_start_psn =
-						cpu_to_le32(opcd_spsn);
-			swq->psn_ext->flags_next_psn =
-						cpu_to_le32(flg_npsn);
-		} else {
-			swq->psn_search->opcode_start_psn =
-						cpu_to_le32(opcd_spsn);
-			swq->psn_search->flags_next_psn =
-						cpu_to_le32(flg_npsn);
-		}
-	}
+	bnxt_qplib_fill_psn_search(qp, wqe, swq);
 queue_err:
-	if (sch_handler) {
-		/* Store the ULP info in the software structures */
-		sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
-		swq = &sq->swq[sw_prod];
-		swq->wr_id = wqe->wr_id;
-		swq->type = wqe->type;
-		swq->flags = wqe->flags;
-		if (qp->sig_type)
-			swq->flags |= SQ_SEND_FLAGS_SIGNAL_COMP;
-		swq->start_psn = sq->psn & BTH_PSN_MASK;
-	}
-	sq->hwq.prod++;
+	bnxt_qplib_swq_mod_start(sq, wqe_idx);
+	bnxt_qplib_hwq_incr_prod(hwq, swq->slots);
 	qp->wqe_cnt++;
-
 done:
 	if (sch_handler) {
 		nq_work = kzalloc(sizeof(*nq_work), GFP_ATOMIC);
@@ -1796,7 +1933,7 @@
 			INIT_WORK(&nq_work->work, bnxt_qpn_cqn_sched_task);
 			queue_work(qp->scq->nq->cqn_wq, &nq_work->work);
 		} else {
-			dev_err(&sq->hwq.pdev->dev,
+			dev_err(&hwq->pdev->dev,
 				"FP: Failed to allocate SQ nq_work!\n");
 			rc = -ENOMEM;
 		}
@@ -1807,77 +1944,74 @@
 void bnxt_qplib_post_recv_db(struct bnxt_qplib_qp *qp)
 {
 	struct bnxt_qplib_q *rq = &qp->rq;
-	u32 sw_prod;
-	u64 val = 0;
 
-	val = (((qp->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
-	       DBC_DBC_TYPE_RQ);
-	val <<= 32;
-	sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
-	val |= (sw_prod << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
-	/* Flush the writes to HW Rx WQE before the ringing Rx DB */
-	writeq(val, qp->dpi->dbr);
+	bnxt_qplib_ring_prod_db(&rq->dbinfo, DBC_DBC_TYPE_RQ);
 }
 
 int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp,
 			 struct bnxt_qplib_swqe *wqe)
 {
-	struct bnxt_qplib_q *rq = &qp->rq;
-	struct rq_wqe *rqe, **rqe_ptr;
-	struct sq_sge *hw_sge;
 	struct bnxt_qplib_nq_work *nq_work = NULL;
+	struct bnxt_qplib_q *rq = &qp->rq;
+	struct rq_wqe_hdr *base_hdr;
+	struct rq_ext_hdr *ext_hdr;
+	struct bnxt_qplib_hwq *hwq;
+	struct bnxt_qplib_swq *swq;
 	bool sch_handler = false;
-	u32 sw_prod;
-	int i, rc = 0;
+	u16 wqe_sz, idx;
+	u32 wqe_idx;
+	int rc = 0;
 
-	if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
-		sch_handler = true;
-		dev_dbg(&rq->hwq.pdev->dev,
-			"%s: Error QP. Scheduling for poll_cq\n", __func__);
-		goto queue_err;
+	hwq = &rq->hwq;
+	if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_RESET) {
+		dev_err(&hwq->pdev->dev,
+			"QPLIB: FP: QP (0x%x) is in the 0x%x state",
+			qp->id, qp->state);
+		rc = -EINVAL;
+		goto done;
 	}
-	if (bnxt_qplib_queue_full(rq)) {
-		dev_err(&rq->hwq.pdev->dev,
+
+	if (bnxt_qplib_queue_full(rq, rq->dbinfo.max_slot)) {
+		dev_err(&hwq->pdev->dev,
 			"FP: QP (0x%x) RQ is full!\n", qp->id);
 		rc = -EINVAL;
 		goto done;
 	}
-	sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
-	rq->swq[sw_prod].wr_id = wqe->wr_id;
 
-	rqe_ptr = (struct rq_wqe **)rq->hwq.pbl_ptr;
-	rqe = &rqe_ptr[RQE_PG(sw_prod)][RQE_IDX(sw_prod)];
+	swq = bnxt_qplib_get_swqe(rq, &wqe_idx);
+	swq->wr_id = wqe->wr_id;
+	swq->slots = rq->dbinfo.max_slot;
 
-	memset(rqe, 0, BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
-
-	/* Calculate wqe_size16 and data_len */
-	for (i = 0, hw_sge = (struct sq_sge *)rqe->data;
-	     i < wqe->num_sge; i++, hw_sge++) {
-		hw_sge->va_or_pa = cpu_to_le64(wqe->sg_list[i].addr);
-		hw_sge->l_key = cpu_to_le32(wqe->sg_list[i].lkey);
-		hw_sge->size = cpu_to_le32(wqe->sg_list[i].size);
+	if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) {
+		sch_handler = true;
+		dev_dbg(&hwq->pdev->dev,
+			"%s: Error QP. Scheduling for poll_cq\n", __func__);
+		goto queue_err;
 	}
-	rqe->wqe_type = wqe->type;
-	rqe->flags = wqe->flags;
-	rqe->wqe_size = wqe->num_sge +
-			((offsetof(typeof(*rqe), data) + 15) >> 4);
-	/* HW requires wqe size has room for atleast one SGE even if none
-	 * was supplied by ULP
-	 */
-	if (!wqe->num_sge)
-		rqe->wqe_size++;
 
-	/* Supply the rqe->wr_id index to the wr_id_tbl for now */
-	rqe->wr_id[0] = cpu_to_le32(sw_prod);
+	idx = 0;
+	base_hdr = bnxt_qplib_get_prod_qe(hwq, idx++);
+	ext_hdr = bnxt_qplib_get_prod_qe(hwq, idx++);
+	memset(base_hdr, 0, sizeof(struct sq_sge));
+	memset(ext_hdr, 0, sizeof(struct sq_sge));
+	wqe_sz = (sizeof(struct rq_wqe_hdr) +
+		  wqe->num_sge * sizeof(struct sq_sge)) >> 4;
+	bnxt_qplib_put_sges(hwq, wqe->sg_list, wqe->num_sge, &idx);
+	if (!wqe->num_sge) {
+		struct sq_sge *sge;
 
+		sge = bnxt_qplib_get_prod_qe(hwq, idx++);
+		sge->size = 0;
+		wqe_sz++;
+	}
+	base_hdr->wqe_type = wqe->type;
+	base_hdr->flags = wqe->flags;
+	base_hdr->wqe_size = wqe_sz;
+	base_hdr->wr_id[0] = cpu_to_le32(wqe_idx);
 queue_err:
-	if (sch_handler) {
-		/* Store the ULP info in the software structures */
-		sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
-		rq->swq[sw_prod].wr_id = wqe->wr_id;
-	}
-
-	rq->hwq.prod++;
+	bnxt_qplib_swq_mod_start(rq, wqe_idx);
+	bnxt_qplib_hwq_incr_prod(hwq, swq->slots);
+done:
 	if (sch_handler) {
 		nq_work = kzalloc(sizeof(*nq_work), GFP_ATOMIC);
 		if (nq_work) {
@@ -1886,58 +2020,33 @@
 			INIT_WORK(&nq_work->work, bnxt_qpn_cqn_sched_task);
 			queue_work(qp->rcq->nq->cqn_wq, &nq_work->work);
 		} else {
-			dev_err(&rq->hwq.pdev->dev,
+			dev_err(&hwq->pdev->dev,
 				"FP: Failed to allocate RQ nq_work!\n");
 			rc = -ENOMEM;
 		}
 	}
-done:
+
 	return rc;
 }
 
 /* CQ */
-
-/* Spinlock must be held */
-static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq)
-{
-	u64 val = 0;
-
-	val = ((cq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) |
-	       DBC_DBC_TYPE_CQ_ARMENA;
-	val <<= 32;
-	/* Flush memory writes before enabling the CQ */
-	writeq(val, cq->dbr_base);
-}
-
-static void bnxt_qplib_arm_cq(struct bnxt_qplib_cq *cq, u32 arm_type)
-{
-	struct bnxt_qplib_hwq *cq_hwq = &cq->hwq;
-	u32 sw_cons;
-	u64 val = 0;
-
-	/* Ring DB */
-	val = ((cq->id << DBC_DBC_XID_SFT) & DBC_DBC_XID_MASK) | arm_type;
-	val <<= 32;
-	sw_cons = HWQ_CMP(cq_hwq->cons, cq_hwq);
-	val |= (sw_cons << DBC_DBC_INDEX_SFT) & DBC_DBC_INDEX_MASK;
-	/* flush memory writes before arming the CQ */
-	writeq(val, cq->dpi->dbr);
-}
-
 int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
-	struct cmdq_create_cq req;
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
 	struct creq_create_cq_resp resp;
 	struct bnxt_qplib_pbl *pbl;
+	struct cmdq_create_cq req;
 	u16 cmd_flags = 0;
+	u32 pg_sz_lvl;
 	int rc;
 
-	cq->hwq.max_elements = cq->max_wqe;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &cq->hwq, &cq->sg_info,
-				       &cq->hwq.max_elements,
-				       BNXT_QPLIB_MAX_CQE_ENTRY_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_QUEUE);
+	hwq_attr.res = res;
+	hwq_attr.depth = cq->max_wqe;
+	hwq_attr.stride = sizeof(struct cq_base);
+	hwq_attr.type = HWQ_TYPE_QUEUE;
+	hwq_attr.sginfo = &cq->sg_info;
+	rc = bnxt_qplib_alloc_init_hwq(&cq->hwq, &hwq_attr);
 	if (rc)
 		goto exit;
 
@@ -1950,22 +2059,13 @@
 	}
 	req.dpi = cpu_to_le32(cq->dpi->dpi);
 	req.cq_handle = cpu_to_le64(cq->cq_handle);
-
 	req.cq_size = cpu_to_le32(cq->hwq.max_elements);
 	pbl = &cq->hwq.pbl[PBL_LVL_0];
-	req.pg_size_lvl = cpu_to_le32(
-	    ((cq->hwq.level & CMDQ_CREATE_CQ_LVL_MASK) <<
-						CMDQ_CREATE_CQ_LVL_SFT) |
-	    (pbl->pg_size == ROCE_PG_SIZE_4K ? CMDQ_CREATE_CQ_PG_SIZE_PG_4K :
-	     pbl->pg_size == ROCE_PG_SIZE_8K ? CMDQ_CREATE_CQ_PG_SIZE_PG_8K :
-	     pbl->pg_size == ROCE_PG_SIZE_64K ? CMDQ_CREATE_CQ_PG_SIZE_PG_64K :
-	     pbl->pg_size == ROCE_PG_SIZE_2M ? CMDQ_CREATE_CQ_PG_SIZE_PG_2M :
-	     pbl->pg_size == ROCE_PG_SIZE_8M ? CMDQ_CREATE_CQ_PG_SIZE_PG_8M :
-	     pbl->pg_size == ROCE_PG_SIZE_1G ? CMDQ_CREATE_CQ_PG_SIZE_PG_1G :
-	     CMDQ_CREATE_CQ_PG_SIZE_PG_4K));
-
+	pg_sz_lvl = (bnxt_qplib_base_pg_size(&cq->hwq) <<
+		     CMDQ_CREATE_CQ_PG_SIZE_SFT);
+	pg_sz_lvl |= (cq->hwq.level & CMDQ_CREATE_CQ_LVL_MASK);
+	req.pg_size_lvl = cpu_to_le32(pg_sz_lvl);
 	req.pbl = cpu_to_le64(pbl->pg_map_arr[0]);
-
 	req.cq_fco_cnq_id = cpu_to_le32(
 			(cq->cnq_hw_ring_id & CMDQ_CREATE_CQ_CNQ_ID_MASK) <<
 			 CMDQ_CREATE_CQ_CNQ_ID_SFT);
@@ -1976,7 +2076,6 @@
 		goto fail;
 
 	cq->id = le32_to_cpu(resp.xid);
-	cq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem;
 	cq->period = BNXT_QPLIB_QUEUE_START_PERIOD;
 	init_waitqueue_head(&cq->waitq);
 	INIT_LIST_HEAD(&cq->sqf_head);
@@ -1984,11 +2083,17 @@
 	spin_lock_init(&cq->compl_lock);
 	spin_lock_init(&cq->flush_lock);
 
-	bnxt_qplib_arm_cq_enable(cq);
+	cq->dbinfo.hwq = &cq->hwq;
+	cq->dbinfo.xid = cq->id;
+	cq->dbinfo.db = cq->dpi->dbr;
+	cq->dbinfo.priv_db = res->dpi_tbl.dbr_bar_reg_iomem;
+
+	bnxt_qplib_armen_db(&cq->dbinfo, DBC_DBC_TYPE_CQ_ARMENA);
+
 	return 0;
 
 fail:
-	bnxt_qplib_free_hwq(res->pdev, &cq->hwq);
+	bnxt_qplib_free_hwq(res, &cq->hwq);
 exit:
 	return rc;
 }
@@ -1998,6 +2103,7 @@
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
 	struct cmdq_destroy_cq req;
 	struct creq_destroy_cq_resp resp;
+	u16 total_cnq_events;
 	u16 cmd_flags = 0;
 	int rc;
 
@@ -2008,27 +2114,28 @@
 					  (void *)&resp, NULL, 0);
 	if (rc)
 		return rc;
-	bnxt_qplib_free_hwq(res->pdev, &cq->hwq);
+	total_cnq_events = le16_to_cpu(resp.total_cnq_events);
+	__wait_for_all_nqes(cq, total_cnq_events);
+	bnxt_qplib_free_hwq(res, &cq->hwq);
 	return 0;
 }
 
 static int __flush_sq(struct bnxt_qplib_q *sq, struct bnxt_qplib_qp *qp,
 		      struct bnxt_qplib_cqe **pcqe, int *budget)
 {
-	u32 sw_prod, sw_cons;
 	struct bnxt_qplib_cqe *cqe;
+	u32 start, last;
 	int rc = 0;
 
 	/* Now complete all outstanding SQEs with FLUSHED_ERR */
-	sw_prod = HWQ_CMP(sq->hwq.prod, &sq->hwq);
+	start = sq->swq_start;
 	cqe = *pcqe;
 	while (*budget) {
-		sw_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq);
-		if (sw_cons == sw_prod) {
+		last = sq->swq_last;
+		if (start == last)
 			break;
-		}
 		/* Skip the FENCE WQE completions */
-		if (sq->swq[sw_cons].wr_id == BNXT_QPLIB_FENCE_WRID) {
+		if (sq->swq[last].wr_id == BNXT_QPLIB_FENCE_WRID) {
 			bnxt_qplib_cancel_phantom_processing(qp);
 			goto skip_compl;
 		}
@@ -2036,16 +2143,17 @@
 		cqe->status = CQ_REQ_STATUS_WORK_REQUEST_FLUSHED_ERR;
 		cqe->opcode = CQ_BASE_CQE_TYPE_REQ;
 		cqe->qp_handle = (u64)(unsigned long)qp;
-		cqe->wr_id = sq->swq[sw_cons].wr_id;
+		cqe->wr_id = sq->swq[last].wr_id;
 		cqe->src_qp = qp->id;
-		cqe->type = sq->swq[sw_cons].type;
+		cqe->type = sq->swq[last].type;
 		cqe++;
 		(*budget)--;
 skip_compl:
-		sq->hwq.cons++;
+		bnxt_qplib_hwq_incr_cons(&sq->hwq, sq->swq[last].slots);
+		sq->swq_last = sq->swq[last].next_idx;
 	}
 	*pcqe = cqe;
-	if (!(*budget) && HWQ_CMP(sq->hwq.cons, &sq->hwq) != sw_prod)
+	if (!(*budget) && sq->swq_last != start)
 		/* Out of budget */
 		rc = -EAGAIN;
 
@@ -2056,9 +2164,9 @@
 		      struct bnxt_qplib_cqe **pcqe, int *budget)
 {
 	struct bnxt_qplib_cqe *cqe;
-	u32 sw_prod, sw_cons;
-	int rc = 0;
+	u32 start, last;
 	int opcode = 0;
+	int rc = 0;
 
 	switch (qp->type) {
 	case CMDQ_CREATE_QP1_TYPE_GSI:
@@ -2074,24 +2182,25 @@
 	}
 
 	/* Flush the rest of the RQ */
-	sw_prod = HWQ_CMP(rq->hwq.prod, &rq->hwq);
+	start = rq->swq_start;
 	cqe = *pcqe;
 	while (*budget) {
-		sw_cons = HWQ_CMP(rq->hwq.cons, &rq->hwq);
-		if (sw_cons == sw_prod)
+		last = rq->swq_last;
+		if (last == start)
 			break;
 		memset(cqe, 0, sizeof(*cqe));
 		cqe->status =
 		    CQ_RES_RC_STATUS_WORK_REQUEST_FLUSHED_ERR;
 		cqe->opcode = opcode;
 		cqe->qp_handle = (unsigned long)qp;
-		cqe->wr_id = rq->swq[sw_cons].wr_id;
+		cqe->wr_id = rq->swq[last].wr_id;
 		cqe++;
 		(*budget)--;
-		rq->hwq.cons++;
+		bnxt_qplib_hwq_incr_cons(&rq->hwq, rq->swq[last].slots);
+		rq->swq_last = rq->swq[last].next_idx;
 	}
 	*pcqe = cqe;
-	if (!*budget && HWQ_CMP(rq->hwq.cons, &rq->hwq) != sw_prod)
+	if (!*budget && rq->swq_last != start)
 		/* Out of budget */
 		rc = -EAGAIN;
 
@@ -2114,20 +2223,20 @@
 *       CQE is tracked from sw_cq_cons to max_element but valid only if VALID=1
  */
 static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq,
-		     u32 cq_cons, u32 sw_sq_cons, u32 cqe_sq_cons)
+		     u32 cq_cons, u32 swq_last, u32 cqe_sq_cons)
 {
-	struct bnxt_qplib_q *sq = &qp->sq;
-	struct bnxt_qplib_swq *swq;
 	u32 peek_sw_cq_cons, peek_raw_cq_cons, peek_sq_cons_idx;
-	struct cq_base *peek_hwcqe, **peek_hw_cqe_ptr;
+	struct bnxt_qplib_q *sq = &qp->sq;
 	struct cq_req *peek_req_hwcqe;
 	struct bnxt_qplib_qp *peek_qp;
 	struct bnxt_qplib_q *peek_sq;
+	struct bnxt_qplib_swq *swq;
+	struct cq_base *peek_hwcqe;
 	int i, rc = 0;
 
 	/* Normal mode */
 	/* Check for the psn_search marking before completing */
-	swq = &sq->swq[sw_sq_cons];
+	swq = &sq->swq[swq_last];
 	if (swq->psn_search &&
 	    le32_to_cpu(swq->psn_search->flags_next_psn) & 0x80000000) {
 		/* Unmark */
@@ -2136,13 +2245,12 @@
 				     & ~0x80000000);
 		dev_dbg(&cq->hwq.pdev->dev,
 			"FP: Process Req cq_cons=0x%x qp=0x%x sq cons sw=0x%x cqe=0x%x marked!\n",
-			cq_cons, qp->id, sw_sq_cons, cqe_sq_cons);
+			cq_cons, qp->id, swq_last, cqe_sq_cons);
 		sq->condition = true;
 		sq->send_phantom = true;
 
 		/* TODO: Only ARM if the previous SQE is ARMALL */
-		bnxt_qplib_arm_cq(cq, DBC_DBC_TYPE_CQ_ARMALL);
-
+		bnxt_qplib_ring_db(&cq->dbinfo, DBC_DBC_TYPE_CQ_ARMALL);
 		rc = -EAGAIN;
 		goto out;
 	}
@@ -2153,9 +2261,8 @@
 		i = cq->hwq.max_elements;
 		while (i--) {
 			peek_sw_cq_cons = HWQ_CMP((peek_sw_cq_cons), &cq->hwq);
-			peek_hw_cqe_ptr = (struct cq_base **)cq->hwq.pbl_ptr;
-			peek_hwcqe = &peek_hw_cqe_ptr[CQE_PG(peek_sw_cq_cons)]
-						     [CQE_IDX(peek_sw_cq_cons)];
+			peek_hwcqe = bnxt_qplib_get_qe(&cq->hwq,
+						       peek_sw_cq_cons, NULL);
 			/* If the next hwcqe is VALID */
 			if (CQE_CMP_VALID(peek_hwcqe, peek_raw_cq_cons,
 					  cq->hwq.max_elements)) {
@@ -2175,9 +2282,10 @@
 						 le64_to_cpu
 						 (peek_req_hwcqe->qp_handle));
 					peek_sq = &peek_qp->sq;
-					peek_sq_cons_idx = HWQ_CMP(le16_to_cpu(
-						peek_req_hwcqe->sq_cons_idx) - 1
-						, &sq->hwq);
+					peek_sq_cons_idx =
+						((le16_to_cpu(
+						  peek_req_hwcqe->sq_cons_idx)
+						  - 1) % sq->max_wqe);
 					/* If the hwcqe's sq's wr_id matches */
 					if (peek_sq == sq &&
 					    sq->swq[peek_sq_cons_idx].wr_id ==
@@ -2205,7 +2313,7 @@
 		}
 		dev_err(&cq->hwq.pdev->dev,
 			"Should not have come here! cq_cons=0x%x qp=0x%x sq cons sw=0x%x hw=0x%x\n",
-			cq_cons, qp->id, sw_sq_cons, cqe_sq_cons);
+			cq_cons, qp->id, swq_last, cqe_sq_cons);
 		rc = -EINVAL;
 	}
 out:
@@ -2217,11 +2325,11 @@
 				     struct bnxt_qplib_cqe **pcqe, int *budget,
 				     u32 cq_cons, struct bnxt_qplib_qp **lib_qp)
 {
+	struct bnxt_qplib_swq *swq;
+	struct bnxt_qplib_cqe *cqe;
 	struct bnxt_qplib_qp *qp;
 	struct bnxt_qplib_q *sq;
-	struct bnxt_qplib_cqe *cqe;
-	u32 sw_sq_cons, cqe_sq_cons;
-	struct bnxt_qplib_swq *swq;
+	u32 cqe_sq_cons;
 	int rc = 0;
 
 	qp = (struct bnxt_qplib_qp *)((unsigned long)
@@ -2233,14 +2341,7 @@
 	}
 	sq = &qp->sq;
 
-	cqe_sq_cons = HWQ_CMP(le16_to_cpu(hwcqe->sq_cons_idx), &sq->hwq);
-	if (cqe_sq_cons > sq->hwq.max_elements) {
-		dev_err(&cq->hwq.pdev->dev,
-			"FP: CQ Process req reported sq_cons_idx 0x%x which exceeded max 0x%x\n",
-			cqe_sq_cons, sq->hwq.max_elements);
-		return -EINVAL;
-	}
-
+	cqe_sq_cons = le16_to_cpu(hwcqe->sq_cons_idx) % sq->max_wqe;
 	if (qp->sq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
 			"%s: QP in Flush QP = %p\n", __func__, qp);
@@ -2252,12 +2353,11 @@
 	 */
 	cqe = *pcqe;
 	while (*budget) {
-		sw_sq_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq);
-		if (sw_sq_cons == cqe_sq_cons)
+		if (sq->swq_last == cqe_sq_cons)
 			/* Done */
 			break;
 
-		swq = &sq->swq[sw_sq_cons];
+		swq = &sq->swq[sq->swq_last];
 		memset(cqe, 0, sizeof(*cqe));
 		cqe->opcode = CQ_BASE_CQE_TYPE_REQ;
 		cqe->qp_handle = (u64)(unsigned long)qp;
@@ -2271,12 +2371,12 @@
 		 * of the request being signaled or not, it must complete with
 		 * the hwcqe error status
 		 */
-		if (HWQ_CMP((sw_sq_cons + 1), &sq->hwq) == cqe_sq_cons &&
+		if (swq->next_idx == cqe_sq_cons &&
 		    hwcqe->status != CQ_REQ_STATUS_OK) {
 			cqe->status = hwcqe->status;
 			dev_err(&cq->hwq.pdev->dev,
 				"FP: CQ Processed Req wr_id[%d] = 0x%llx with status 0x%x\n",
-				sw_sq_cons, cqe->wr_id, cqe->status);
+				sq->swq_last, cqe->wr_id, cqe->status);
 			cqe++;
 			(*budget)--;
 			bnxt_qplib_mark_qp_error(qp);
@@ -2284,7 +2384,7 @@
 			bnxt_qplib_add_flush_qp(qp);
 		} else {
 			/* Before we complete, do WA 9060 */
-			if (do_wa9060(qp, cq, cq_cons, sw_sq_cons,
+			if (do_wa9060(qp, cq, cq_cons, sq->swq_last,
 				      cqe_sq_cons)) {
 				*lib_qp = qp;
 				goto out;
@@ -2296,13 +2396,14 @@
 			}
 		}
 skip:
-		sq->hwq.cons++;
+		bnxt_qplib_hwq_incr_cons(&sq->hwq, swq->slots);
+		sq->swq_last = swq->next_idx;
 		if (sq->single)
 			break;
 	}
 out:
 	*pcqe = cqe;
-	if (HWQ_CMP(sq->hwq.cons, &sq->hwq) != cqe_sq_cons) {
+	if (sq->swq_last != cqe_sq_cons) {
 		/* Out of budget */
 		rc = -EAGAIN;
 		goto done;
@@ -2331,10 +2432,10 @@
 					struct bnxt_qplib_cqe **pcqe,
 					int *budget)
 {
-	struct bnxt_qplib_qp *qp;
-	struct bnxt_qplib_q *rq;
 	struct bnxt_qplib_srq *srq;
 	struct bnxt_qplib_cqe *cqe;
+	struct bnxt_qplib_qp *qp;
+	struct bnxt_qplib_q *rq;
 	u32 wr_id_idx;
 	int rc = 0;
 
@@ -2377,17 +2478,23 @@
 		(*budget)--;
 		*pcqe = cqe;
 	} else {
+		struct bnxt_qplib_swq *swq;
+
 		rq = &qp->rq;
-		if (wr_id_idx >= rq->hwq.max_elements) {
+		if (wr_id_idx > (rq->max_wqe - 1)) {
 			dev_err(&cq->hwq.pdev->dev,
 				"FP: CQ Process RC wr_id idx 0x%x exceeded RQ max 0x%x\n",
-				wr_id_idx, rq->hwq.max_elements);
+				wr_id_idx, rq->max_wqe);
 			return -EINVAL;
 		}
-		cqe->wr_id = rq->swq[wr_id_idx].wr_id;
+		if (wr_id_idx != rq->swq_last)
+			return -EINVAL;
+		swq = &rq->swq[rq->swq_last];
+		cqe->wr_id = swq->wr_id;
 		cqe++;
 		(*budget)--;
-		rq->hwq.cons++;
+		bnxt_qplib_hwq_incr_cons(&rq->hwq, swq->slots);
+		rq->swq_last = swq->next_idx;
 		*pcqe = cqe;
 
 		if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
@@ -2406,10 +2513,10 @@
 					struct bnxt_qplib_cqe **pcqe,
 					int *budget)
 {
-	struct bnxt_qplib_qp *qp;
-	struct bnxt_qplib_q *rq;
 	struct bnxt_qplib_srq *srq;
 	struct bnxt_qplib_cqe *cqe;
+	struct bnxt_qplib_qp *qp;
+	struct bnxt_qplib_q *rq;
 	u32 wr_id_idx;
 	int rc = 0;
 
@@ -2426,7 +2533,7 @@
 	}
 	cqe = *pcqe;
 	cqe->opcode = hwcqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK;
-	cqe->length = (u32)le16_to_cpu(hwcqe->length);
+	cqe->length = le16_to_cpu(hwcqe->length) & CQ_RES_UD_LENGTH_MASK;
 	cqe->cfa_meta = le16_to_cpu(hwcqe->cfa_metadata);
 	cqe->invrkey = le32_to_cpu(hwcqe->imm_data);
 	cqe->flags = le16_to_cpu(hwcqe->flags);
@@ -2458,18 +2565,24 @@
 		(*budget)--;
 		*pcqe = cqe;
 	} else {
+		struct bnxt_qplib_swq *swq;
+
 		rq = &qp->rq;
-		if (wr_id_idx >= rq->hwq.max_elements) {
+		if (wr_id_idx > (rq->max_wqe - 1)) {
 			dev_err(&cq->hwq.pdev->dev,
 				"FP: CQ Process UD wr_id idx 0x%x exceeded RQ max 0x%x\n",
-				wr_id_idx, rq->hwq.max_elements);
+				wr_id_idx, rq->max_wqe);
 			return -EINVAL;
 		}
 
-		cqe->wr_id = rq->swq[wr_id_idx].wr_id;
+		if (rq->swq_last != wr_id_idx)
+			return -EINVAL;
+		swq = &rq->swq[rq->swq_last];
+		cqe->wr_id = swq->wr_id;
 		cqe++;
 		(*budget)--;
-		rq->hwq.cons++;
+		bnxt_qplib_hwq_incr_cons(&rq->hwq, swq->slots);
+		rq->swq_last = swq->next_idx;
 		*pcqe = cqe;
 
 		if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
@@ -2484,15 +2597,13 @@
 
 bool bnxt_qplib_is_cq_empty(struct bnxt_qplib_cq *cq)
 {
-	struct cq_base *hw_cqe, **hw_cqe_ptr;
+	struct cq_base *hw_cqe;
 	u32 sw_cons, raw_cons;
 	bool rc = true;
 
 	raw_cons = cq->hwq.cons;
 	sw_cons = HWQ_CMP(raw_cons, &cq->hwq);
-	hw_cqe_ptr = (struct cq_base **)cq->hwq.pbl_ptr;
-	hw_cqe = &hw_cqe_ptr[CQE_PG(sw_cons)][CQE_IDX(sw_cons)];
-
+	hw_cqe = bnxt_qplib_get_qe(&cq->hwq, sw_cons, NULL);
 	 /* Check for Valid bit. If the CQE is valid, return false */
 	rc = !CQE_CMP_VALID(hw_cqe, raw_cons, cq->hwq.max_elements);
 	return rc;
@@ -2562,17 +2673,23 @@
 		(*budget)--;
 		*pcqe = cqe;
 	} else {
+		struct bnxt_qplib_swq *swq;
+
 		rq = &qp->rq;
-		if (wr_id_idx >= rq->hwq.max_elements) {
+		if (wr_id_idx > (rq->max_wqe - 1)) {
 			dev_err(&cq->hwq.pdev->dev,
 				"FP: CQ Process Raw/QP1 RQ wr_id idx 0x%x exceeded RQ max 0x%x\n",
-				wr_id_idx, rq->hwq.max_elements);
+				wr_id_idx, rq->max_wqe);
 			return -EINVAL;
 		}
-		cqe->wr_id = rq->swq[wr_id_idx].wr_id;
+		if (rq->swq_last != wr_id_idx)
+			return -EINVAL;
+		swq = &rq->swq[rq->swq_last];
+		cqe->wr_id = swq->wr_id;
 		cqe++;
 		(*budget)--;
-		rq->hwq.cons++;
+		bnxt_qplib_hwq_incr_cons(&rq->hwq, swq->slots);
+		rq->swq_last = swq->next_idx;
 		*pcqe = cqe;
 
 		if (hwcqe->status != CQ_RES_RC_STATUS_OK) {
@@ -2594,7 +2711,7 @@
 	struct bnxt_qplib_qp *qp;
 	struct bnxt_qplib_q *sq, *rq;
 	struct bnxt_qplib_cqe *cqe;
-	u32 sw_cons = 0, cqe_cons;
+	u32 swq_last = 0, cqe_cons;
 	int rc = 0;
 
 	/* Check the Status */
@@ -2620,13 +2737,7 @@
 	cqe_cons = le16_to_cpu(hwcqe->sq_cons_idx);
 	if (cqe_cons == 0xFFFF)
 		goto do_rq;
-
-	if (cqe_cons > sq->hwq.max_elements) {
-		dev_err(&cq->hwq.pdev->dev,
-			"FP: CQ Process terminal reported sq_cons_idx 0x%x which exceeded max 0x%x\n",
-			cqe_cons, sq->hwq.max_elements);
-		goto do_rq;
-	}
+	cqe_cons %= sq->max_wqe;
 
 	if (qp->sq.flushed) {
 		dev_dbg(&cq->hwq.pdev->dev,
@@ -2640,24 +2751,25 @@
 	 */
 	cqe = *pcqe;
 	while (*budget) {
-		sw_cons = HWQ_CMP(sq->hwq.cons, &sq->hwq);
-		if (sw_cons == cqe_cons)
+		swq_last = sq->swq_last;
+		if (swq_last == cqe_cons)
 			break;
-		if (sq->swq[sw_cons].flags & SQ_SEND_FLAGS_SIGNAL_COMP) {
+		if (sq->swq[swq_last].flags & SQ_SEND_FLAGS_SIGNAL_COMP) {
 			memset(cqe, 0, sizeof(*cqe));
 			cqe->status = CQ_REQ_STATUS_OK;
 			cqe->opcode = CQ_BASE_CQE_TYPE_REQ;
 			cqe->qp_handle = (u64)(unsigned long)qp;
 			cqe->src_qp = qp->id;
-			cqe->wr_id = sq->swq[sw_cons].wr_id;
-			cqe->type = sq->swq[sw_cons].type;
+			cqe->wr_id = sq->swq[swq_last].wr_id;
+			cqe->type = sq->swq[swq_last].type;
 			cqe++;
 			(*budget)--;
 		}
-		sq->hwq.cons++;
+		bnxt_qplib_hwq_incr_cons(&sq->hwq, sq->swq[swq_last].slots);
+		sq->swq_last = sq->swq[swq_last].next_idx;
 	}
 	*pcqe = cqe;
-	if (!(*budget) && sw_cons != cqe_cons) {
+	if (!(*budget) && swq_last != cqe_cons) {
 		/* Out of budget */
 		rc = -EAGAIN;
 		goto sq_done;
@@ -2669,10 +2781,11 @@
 	cqe_cons = le16_to_cpu(hwcqe->rq_cons_idx);
 	if (cqe_cons == 0xFFFF) {
 		goto done;
-	} else if (cqe_cons > rq->hwq.max_elements) {
+	} else if (cqe_cons > rq->max_wqe - 1) {
 		dev_err(&cq->hwq.pdev->dev,
 			"FP: CQ Processed terminal reported rq_cons_idx 0x%x exceeds max 0x%x\n",
-			cqe_cons, rq->hwq.max_elements);
+			cqe_cons, rq->max_wqe);
+		rc = -EINVAL;
 		goto done;
 	}
 
@@ -2736,7 +2849,7 @@
 int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe,
 		       int num_cqes, struct bnxt_qplib_qp **lib_qp)
 {
-	struct cq_base *hw_cqe, **hw_cqe_ptr;
+	struct cq_base *hw_cqe;
 	u32 sw_cons, raw_cons;
 	int budget, rc = 0;
 
@@ -2745,8 +2858,7 @@
 
 	while (budget) {
 		sw_cons = HWQ_CMP(raw_cons, &cq->hwq);
-		hw_cqe_ptr = (struct cq_base **)cq->hwq.pbl_ptr;
-		hw_cqe = &hw_cqe_ptr[CQE_PG(sw_cons)][CQE_IDX(sw_cons)];
+		hw_cqe = bnxt_qplib_get_qe(&cq->hwq, sw_cons, NULL);
 
 		/* Check for Valid bit */
 		if (!CQE_CMP_VALID(hw_cqe, raw_cons, cq->hwq.max_elements))
@@ -2812,7 +2924,7 @@
 	}
 	if (cq->hwq.cons != raw_cons) {
 		cq->hwq.cons = raw_cons;
-		bnxt_qplib_arm_cq(cq, DBC_DBC_TYPE_CQ);
+		bnxt_qplib_ring_db(&cq->dbinfo, DBC_DBC_TYPE_CQ);
 	}
 exit:
 	return num_cqes - budget;
@@ -2821,7 +2933,7 @@
 void bnxt_qplib_req_notify_cq(struct bnxt_qplib_cq *cq, u32 arm_type)
 {
 	if (arm_type)
-		bnxt_qplib_arm_cq(cq, arm_type);
+		bnxt_qplib_ring_db(&cq->dbinfo, arm_type);
 	/* Using cq->arm_state variable to track whether to issue cq handler */
 	atomic_set(&cq->arm_state, 1);
 }
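
Note on the completion-path rework above: __flush_sq(), __flush_rq() and the CQ processing routines no longer derive indices with HWQ_CMP(); they walk the software queue through swq_last and each entry's next_idx, and advance the hardware consumer by the entry's slot count via bnxt_qplib_hwq_incr_cons(). The following is an illustrative, self-contained sketch of that walk only; it is not part of the patch, and swq_entry, ring and drain_ring are hypothetical stand-ins for the driver's bnxt_qplib_swq and bnxt_qplib_q.

struct swq_entry {
	unsigned long long wr_id;   /* caller's work-request id */
	unsigned int next_idx;      /* index of the next software WQE */
	unsigned char slots;        /* hardware slots this WQE consumed */
};

struct ring {
	struct swq_entry *swq;
	unsigned int swq_start;     /* producer side: next free software WQE */
	unsigned int swq_last;      /* consumer side: oldest outstanding WQE */
	unsigned int hw_cons;       /* hardware consumer index */
};

/* Complete up to 'budget' outstanding entries; returns how many were done. */
static int drain_ring(struct ring *q, int budget)
{
	int done = 0;

	while (budget-- > 0 && q->swq_last != q->swq_start) {
		struct swq_entry *e = &q->swq[q->swq_last];

		/* ... a real driver would emit a CQE for e->wr_id here ... */
		q->hw_cons += e->slots;    /* bnxt_qplib_hwq_incr_cons() analogue */
		q->swq_last = e->next_idx; /* follow the software chain */
		done++;
	}
	return done;
}
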
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
index 99e0a13..f507844 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h
@@ -39,12 +39,58 @@
 #ifndef __BNXT_QPLIB_FP_H__
 #define __BNXT_QPLIB_FP_H__
 
+/* A few helper structures are temporarily defined here;
+ * they should be removed once roce_hsi.h is updated in the
+ * original code base.
+ */
+struct sq_ud_ext_hdr {
+	__le32 dst_qp;
+	__le32 avid;
+	__le64 rsvd;
+};
+
+struct sq_raw_ext_hdr {
+	__le32 cfa_meta;
+	__le32 rsvd0;
+	__le64 rsvd1;
+};
+
+struct sq_rdma_ext_hdr {
+	__le64 remote_va;
+	__le32 remote_key;
+	__le32 rsvd;
+};
+
+struct sq_atomic_ext_hdr {
+	__le64 swap_data;
+	__le64 cmp_data;
+};
+
+struct sq_fr_pmr_ext_hdr {
+	__le64 pblptr;
+	__le64 va;
+};
+
+struct sq_bind_ext_hdr {
+	__le64 va;
+	__le32 length_lo;
+	__le32 length_hi;
+};
+
+struct rq_ext_hdr {
+	__le64 rsvd1;
+	__le64 rsvd2;
+};
+
+/* Helper structures end */
+
 struct bnxt_qplib_srq {
 	struct bnxt_qplib_pd		*pd;
 	struct bnxt_qplib_dpi		*dpi;
-	void __iomem			*dbr_base;
+	struct bnxt_qplib_db_info	dbinfo;
 	u64				srq_handle;
 	u32				id;
+	u16				wqe_size;
 	u32				max_wqe;
 	u32				max_sge;
 	u32				threshold;
@@ -65,38 +111,7 @@
 	u32				size;
 };
 
-#define BNXT_QPLIB_MAX_SQE_ENTRY_SIZE	sizeof(struct sq_send)
-
-#define SQE_CNT_PER_PG		(PAGE_SIZE / BNXT_QPLIB_MAX_SQE_ENTRY_SIZE)
-#define SQE_MAX_IDX_PER_PG	(SQE_CNT_PER_PG - 1)
-
-static inline u32 get_sqe_pg(u32 val)
-{
-	return ((val & ~SQE_MAX_IDX_PER_PG) / SQE_CNT_PER_PG);
-}
-
-static inline u32 get_sqe_idx(u32 val)
-{
-	return (val & SQE_MAX_IDX_PER_PG);
-}
-
-#define BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE	sizeof(struct sq_psn_search)
-
-#define PSNE_CNT_PER_PG		(PAGE_SIZE / BNXT_QPLIB_MAX_PSNE_ENTRY_SIZE)
-#define PSNE_MAX_IDX_PER_PG	(PSNE_CNT_PER_PG - 1)
-
-static inline u32 get_psne_pg(u32 val)
-{
-	return ((val & ~PSNE_MAX_IDX_PER_PG) / PSNE_CNT_PER_PG);
-}
-
-static inline u32 get_psne_idx(u32 val)
-{
-	return (val & PSNE_MAX_IDX_PER_PG);
-}
-
 #define BNXT_QPLIB_QP_MAX_SGL	6
-
 struct bnxt_qplib_swq {
 	u64				wr_id;
 	int				next_idx;
@@ -104,6 +119,8 @@
 	u8				flags;
 	u32				start_psn;
 	u32				next_psn;
+	u32				slot_idx;
+	u8				slots;
 	struct sq_psn_search		*psn_search;
 	struct sq_psn_search_ext	*psn_ext;
 };
@@ -226,18 +243,13 @@
 	};
 };
 
-#define BNXT_QPLIB_MAX_RQE_ENTRY_SIZE	sizeof(struct rq_wqe)
-
-#define RQE_CNT_PER_PG		(PAGE_SIZE / BNXT_QPLIB_MAX_RQE_ENTRY_SIZE)
-#define RQE_MAX_IDX_PER_PG	(RQE_CNT_PER_PG - 1)
-#define RQE_PG(x)		(((x) & ~RQE_MAX_IDX_PER_PG) / RQE_CNT_PER_PG)
-#define RQE_IDX(x)		((x) & RQE_MAX_IDX_PER_PG)
-
 struct bnxt_qplib_q {
 	struct bnxt_qplib_hwq		hwq;
 	struct bnxt_qplib_swq		*swq;
+	struct bnxt_qplib_db_info	dbinfo;
 	struct bnxt_qplib_sg_info	sg_info;
 	u32				max_wqe;
+	u16				wqe_size;
 	u16				q_full_delta;
 	u16				max_sge;
 	u32				psn;
@@ -248,6 +260,8 @@
 	u32				phantom_cqe_cnt;
 	u32				next_cq_cons;
 	bool				flushed;
+	u32				swq_start;
+	u32				swq_last;
 };
 
 struct bnxt_qplib_qp {
@@ -255,13 +269,14 @@
 	struct bnxt_qplib_dpi		*dpi;
 	struct bnxt_qplib_chip_ctx	*cctx;
 	u64				qp_handle;
-#define        BNXT_QPLIB_QP_ID_INVALID        0xFFFFFFFF
+#define BNXT_QPLIB_QP_ID_INVALID        0xFFFFFFFF
 	u32				id;
 	u8				type;
 	u8				sig_type;
-	u32				modify_flags;
+	u8				wqe_mode;
 	u8				state;
 	u8				cur_qp_state;
+	u64				modify_flags;
 	u32				max_inline_data;
 	u32				mtu;
 	u8				path_mtu;
@@ -335,11 +350,18 @@
 	(!!((hdr)->cqe_type_toggle & CQ_BASE_TOGGLE) ==		\
 	   !((raw_cons) & (cp_bit)))
 
-static inline bool bnxt_qplib_queue_full(struct bnxt_qplib_q *qplib_q)
+static inline bool bnxt_qplib_queue_full(struct bnxt_qplib_q *que,
+					 u8 slots)
 {
-	return HWQ_CMP((qplib_q->hwq.prod + qplib_q->q_full_delta),
-		       &qplib_q->hwq) == HWQ_CMP(qplib_q->hwq.cons,
-						 &qplib_q->hwq);
+	struct bnxt_qplib_hwq *hwq;
+	int avail;
+
+	hwq = &que->hwq;
+	/* A false "full" indication is possible; retrying the post-send makes sense */
+	avail = hwq->cons - hwq->prod;
+	if (hwq->cons <= hwq->prod)
+		avail += hwq->depth;
+	return avail <= slots;
 }
 
 struct bnxt_qplib_cqe {
@@ -370,7 +392,7 @@
 #define BNXT_QPLIB_QUEUE_START_PERIOD		0x01
 struct bnxt_qplib_cq {
 	struct bnxt_qplib_dpi		*dpi;
-	void __iomem			*dbr_base;
+	struct bnxt_qplib_db_info	dbinfo;
 	u32				max_wqe;
 	u32				id;
 	u16				count;
@@ -401,6 +423,7 @@
  * of the same QP while manipulating the flush list.
  */
 	spinlock_t			flush_lock; /* QP flush management */
+	u16				cnq_events;
 };
 
 #define BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE	sizeof(struct xrrq_irrq)
@@ -433,66 +456,32 @@
 					 NQ_DB_IDX_VALID |	\
 					 NQ_DB_IRQ_DIS)
 
-static inline void bnxt_qplib_ring_nq_db64(void __iomem *db, u32 index,
-					   u32 xid, bool arm)
-{
-	u64 val;
+struct bnxt_qplib_nq_db {
+	struct bnxt_qplib_reg_desc	reg;
+	struct bnxt_qplib_db_info	dbinfo;
+};
 
-	val = xid & DBC_DBC_XID_MASK;
-	val |= DBC_DBC_PATH_ROCE;
-	val |= arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ;
-	val <<= 32;
-	val |= index & DBC_DBC_INDEX_MASK;
-	writeq(val, db);
-}
-
-static inline void bnxt_qplib_ring_nq_db_rearm(void __iomem *db, u32 raw_cons,
-					       u32 max_elements, u32 xid,
-					       bool gen_p5)
-{
-	u32 index = raw_cons & (max_elements - 1);
-
-	if (gen_p5)
-		bnxt_qplib_ring_nq_db64(db, index, xid, true);
-	else
-		writel(NQ_DB_CP_FLAGS_REARM | (index & DBC_DBC32_XID_MASK), db);
-}
-
-static inline void bnxt_qplib_ring_nq_db(void __iomem *db, u32 raw_cons,
-					 u32 max_elements, u32 xid,
-					 bool gen_p5)
-{
-	u32 index = raw_cons & (max_elements - 1);
-
-	if (gen_p5)
-		bnxt_qplib_ring_nq_db64(db, index, xid, false);
-	else
-		writel(NQ_DB_CP_FLAGS | (index & DBC_DBC32_XID_MASK), db);
-}
+typedef int (*cqn_handler_t)(struct bnxt_qplib_nq *nq,
+		struct bnxt_qplib_cq *cq);
+typedef int (*srqn_handler_t)(struct bnxt_qplib_nq *nq,
+		struct bnxt_qplib_srq *srq, u8 event);
 
 struct bnxt_qplib_nq {
-	struct pci_dev		*pdev;
-	struct bnxt_qplib_res	*res;
+	struct pci_dev			*pdev;
+	struct bnxt_qplib_res		*res;
+	char				name[32];
+	struct bnxt_qplib_hwq		hwq;
+	struct bnxt_qplib_nq_db		nq_db;
+	u16				ring_id;
+	int				msix_vec;
+	cpumask_t			mask;
+	struct tasklet_struct		nq_tasklet;
+	bool				requested;
+	int				budget;
 
-	int			vector;
-	cpumask_t		mask;
-	int			budget;
-	bool			requested;
-	struct tasklet_struct	worker;
-	struct bnxt_qplib_hwq	hwq;
-
-	u16			bar_reg;
-	u32			bar_reg_off;
-	u16			ring_id;
-	void __iomem		*bar_reg_iomem;
-
-	int			(*cqn_handler)(struct bnxt_qplib_nq *nq,
-					       struct bnxt_qplib_cq *cq);
-	int			(*srqn_handler)(struct bnxt_qplib_nq *nq,
-						struct bnxt_qplib_srq *srq,
-						u8 event);
-	struct workqueue_struct	*cqn_wq;
-	char			name[32];
+	cqn_handler_t			cqn_handler;
+	srqn_handler_t			srqn_handler;
+	struct workqueue_struct		*cqn_wq;
 };
 
 struct bnxt_qplib_nq_work {
@@ -507,11 +496,8 @@
 			    int msix_vector, bool need_init);
 int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq,
 			 int nq_idx, int msix_vector, int bar_reg_offset,
-			 int (*cqn_handler)(struct bnxt_qplib_nq *nq,
-					    struct bnxt_qplib_cq *cq),
-			 int (*srqn_handler)(struct bnxt_qplib_nq *nq,
-					     struct bnxt_qplib_srq *srq,
-					     u8 event));
+			 cqn_handler_t cqn_handler,
+			 srqn_handler_t srq_handler);
 int bnxt_qplib_create_srq(struct bnxt_qplib_res *res,
 			  struct bnxt_qplib_srq *srq);
 int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res,
@@ -550,7 +536,7 @@
 bool bnxt_qplib_is_cq_empty(struct bnxt_qplib_cq *cq);
 void bnxt_qplib_req_notify_cq(struct bnxt_qplib_cq *cq, u32 arm_type);
 void bnxt_qplib_free_nq(struct bnxt_qplib_nq *nq);
-int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq);
+int bnxt_qplib_alloc_nq(struct bnxt_qplib_res *res, struct bnxt_qplib_nq *nq);
 void bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp);
 void bnxt_qplib_acquire_cq_locks(struct bnxt_qplib_qp *qp,
 				 unsigned long *flags);
@@ -560,4 +546,64 @@
 				  struct bnxt_qplib_cqe *cqe,
 				  int num_cqes);
 void bnxt_qplib_flush_cqn_wq(struct bnxt_qplib_qp *qp);
+
+static inline void *bnxt_qplib_get_swqe(struct bnxt_qplib_q *que, u32 *swq_idx)
+{
+	u32 idx;
+
+	idx = que->swq_start;
+	if (swq_idx)
+		*swq_idx = idx;
+	return &que->swq[idx];
+}
+
+static inline void bnxt_qplib_swq_mod_start(struct bnxt_qplib_q *que, u32 idx)
+{
+	que->swq_start = que->swq[idx].next_idx;
+}
+
+static inline u32 bnxt_qplib_get_depth(struct bnxt_qplib_q *que)
+{
+	return (que->wqe_size * que->max_wqe) / sizeof(struct sq_sge);
+}
+
+static inline u32 bnxt_qplib_set_sq_size(struct bnxt_qplib_q *que, u8 wqe_mode)
+{
+	return (wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ?
+		que->max_wqe : bnxt_qplib_get_depth(que);
+}
+
+static inline u32 bnxt_qplib_set_sq_max_slot(u8 wqe_mode)
+{
+	return (wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ?
+		sizeof(struct sq_send) / sizeof(struct sq_sge) : 1;
+}
+
+static inline u32 bnxt_qplib_set_rq_max_slot(u32 wqe_size)
+{
+	return (wqe_size / sizeof(struct sq_sge));
+}
+
+static inline u16 __xlate_qfd(u16 delta, u16 wqe_bytes)
+{
+	/* For Cu/Wh: delta = 128, stride = 16, wqe_bytes = 128.
+	 * For Gen-p5 B/C mode: delta = 0, stride = 16, wqe_bytes = 128.
+	 * For Gen-p5 with 8916 disabled: delta = 0, stride = 16,
+	 * 32 <= wqe_bytes <= 512.
+	 */
+	return (delta * wqe_bytes) / sizeof(struct sq_sge);
+}
+
+static inline u16 bnxt_qplib_calc_ilsize(struct bnxt_qplib_swqe *wqe, u16 max)
+{
+	u16 size = 0;
+	int indx;
+
+	for (indx = 0; indx < wqe->num_sge; indx++)
+		size += wqe->sg_list[indx].size;
+	if (size > max)
+		size = max;
+
+	return size;
+}
 #endif /* __BNXT_QPLIB_FP_H__ */
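
For reference, the occupancy arithmetic behind the new bnxt_qplib_queue_full() helper above can be exercised on its own. The sketch below is illustrative only and not part of the patch; ring_would_be_full() and its parameters are hypothetical names, and it assumes prod and cons are wrapped indices in the range [0, depth), as they are for bnxt_qplib_hwq.

#include <stdbool.h>

/* Mirror of the bnxt_qplib_queue_full() computation: how many slots remain
 * between the consumer and the producer, and would a request needing
 * 'slots' entries overflow the ring?
 */
static bool ring_would_be_full(unsigned int prod, unsigned int cons,
			       unsigned int depth, int slots)
{
	int avail;

	avail = (int)cons - (int)prod;
	if (cons <= prod)
		avail += (int)depth;
	/* A false "full" is possible under concurrent updates; callers
	 * simply retry the post in that case.
	 */
	return avail <= slots;
}

For example, with depth = 8, prod = 6 and cons = 2, avail works out to 4, so a request needing 5 slots is reported as full while one needing 3 is accepted.
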
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index 60c8f76..5759027 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -50,17 +50,19 @@
 #include "qplib_sp.h"
 #include "qplib_fp.h"
 
-static void bnxt_qplib_service_creq(unsigned long data);
+static void bnxt_qplib_service_creq(struct tasklet_struct *t);
 
 /* Hardware communication channel */
 static int __wait_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie)
 {
+	struct bnxt_qplib_cmdq_ctx *cmdq;
 	u16 cbit;
 	int rc;
 
+	cmdq = &rcfw->cmdq;
 	cbit = cookie % rcfw->cmdq_depth;
-	rc = wait_event_timeout(rcfw->waitq,
-				!test_bit(cbit, rcfw->cmdq_bitmap),
+	rc = wait_event_timeout(cmdq->waitq,
+				!test_bit(cbit, cmdq->cmdq_bitmap),
 				msecs_to_jiffies(RCFW_CMD_WAIT_TIME_MS));
 	return rc ? 0 : -ETIMEDOUT;
 };
@@ -68,15 +70,17 @@
 static int __block_for_resp(struct bnxt_qplib_rcfw *rcfw, u16 cookie)
 {
 	u32 count = RCFW_BLOCKED_CMD_WAIT_COUNT;
+	struct bnxt_qplib_cmdq_ctx *cmdq;
 	u16 cbit;
 
+	cmdq = &rcfw->cmdq;
 	cbit = cookie % rcfw->cmdq_depth;
-	if (!test_bit(cbit, rcfw->cmdq_bitmap))
+	if (!test_bit(cbit, cmdq->cmdq_bitmap))
 		goto done;
 	do {
 		mdelay(1); /* 1m sec */
-		bnxt_qplib_service_creq((unsigned long)rcfw);
-	} while (test_bit(cbit, rcfw->cmdq_bitmap) && --count);
+		bnxt_qplib_service_creq(&rcfw->creq.creq_tasklet);
+	} while (test_bit(cbit, cmdq->cmdq_bitmap) && --count);
 done:
 	return count ? 0 : -ETIMEDOUT;
 };
@@ -84,56 +88,59 @@
 static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
 			  struct creq_base *resp, void *sb, u8 is_block)
 {
-	struct bnxt_qplib_cmdqe *cmdqe, **cmdq_ptr;
-	struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq;
-	u32 cmdq_depth = rcfw->cmdq_depth;
-	struct bnxt_qplib_crsq *crsqe;
+	struct bnxt_qplib_cmdq_ctx *cmdq = &rcfw->cmdq;
+	struct bnxt_qplib_hwq *hwq = &cmdq->hwq;
+	struct bnxt_qplib_crsqe *crsqe;
+	struct bnxt_qplib_cmdqe *cmdqe;
 	u32 sw_prod, cmdq_prod;
+	struct pci_dev *pdev;
 	unsigned long flags;
 	u32 size, opcode;
 	u16 cookie, cbit;
 	u8 *preq;
 
+	pdev = rcfw->pdev;
+
 	opcode = req->opcode;
-	if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) &&
+	if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) &&
 	    (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC &&
 	     opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW &&
 	     opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) {
-		dev_err(&rcfw->pdev->dev,
+		dev_err(&pdev->dev,
 			"RCFW not initialized, reject opcode 0x%x\n", opcode);
 		return -EINVAL;
 	}
 
-	if (test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) &&
+	if (test_bit(FIRMWARE_INITIALIZED_FLAG, &cmdq->flags) &&
 	    opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) {
-		dev_err(&rcfw->pdev->dev, "RCFW already initialized!\n");
+		dev_err(&pdev->dev, "RCFW already initialized!\n");
 		return -EINVAL;
 	}
 
-	if (test_bit(FIRMWARE_TIMED_OUT, &rcfw->flags))
+	if (test_bit(FIRMWARE_TIMED_OUT, &cmdq->flags))
 		return -ETIMEDOUT;
 
 	/* Cmdq are in 16-byte units, each request can consume 1 or more
 	 * cmdqe
 	 */
-	spin_lock_irqsave(&cmdq->lock, flags);
-	if (req->cmd_size >= HWQ_FREE_SLOTS(cmdq)) {
-		dev_err(&rcfw->pdev->dev, "RCFW: CMDQ is full!\n");
-		spin_unlock_irqrestore(&cmdq->lock, flags);
+	spin_lock_irqsave(&hwq->lock, flags);
+	if (req->cmd_size >= HWQ_FREE_SLOTS(hwq)) {
+		dev_err(&pdev->dev, "RCFW: CMDQ is full!\n");
+		spin_unlock_irqrestore(&hwq->lock, flags);
 		return -EAGAIN;
 	}
 
 
-	cookie = rcfw->seq_num & RCFW_MAX_COOKIE_VALUE;
+	cookie = cmdq->seq_num & RCFW_MAX_COOKIE_VALUE;
 	cbit = cookie % rcfw->cmdq_depth;
 	if (is_block)
 		cookie |= RCFW_CMD_IS_BLOCKING;
 
-	set_bit(cbit, rcfw->cmdq_bitmap);
+	set_bit(cbit, cmdq->cmdq_bitmap);
 	req->cookie = cpu_to_le16(cookie);
 	crsqe = &rcfw->crsqe_tbl[cbit];
 	if (crsqe->resp) {
-		spin_unlock_irqrestore(&cmdq->lock, flags);
+		spin_unlock_irqrestore(&hwq->lock, flags);
 		return -EBUSY;
 	}
 
@@ -155,15 +162,13 @@
 				  BNXT_QPLIB_CMDQE_UNITS;
 	}
 
-	cmdq_ptr = (struct bnxt_qplib_cmdqe **)cmdq->pbl_ptr;
 	preq = (u8 *)req;
 	do {
 		/* Locate the next cmdq slot */
-		sw_prod = HWQ_CMP(cmdq->prod, cmdq);
-		cmdqe = &cmdq_ptr[get_cmdq_pg(sw_prod, cmdq_depth)]
-				[get_cmdq_idx(sw_prod, cmdq_depth)];
+		sw_prod = HWQ_CMP(hwq->prod, hwq);
+		cmdqe = bnxt_qplib_get_qe(hwq, sw_prod, NULL);
 		if (!cmdqe) {
-			dev_err(&rcfw->pdev->dev,
+			dev_err(&pdev->dev,
 				"RCFW request failed with no cmdqe!\n");
 			goto done;
 		}
@@ -172,31 +177,27 @@
 		memcpy(cmdqe, preq, min_t(u32, size, sizeof(*cmdqe)));
 		preq += min_t(u32, size, sizeof(*cmdqe));
 		size -= min_t(u32, size, sizeof(*cmdqe));
-		cmdq->prod++;
-		rcfw->seq_num++;
+		hwq->prod++;
 	} while (size > 0);
+	cmdq->seq_num++;
 
-	rcfw->seq_num++;
-
-	cmdq_prod = cmdq->prod;
-	if (test_bit(FIRMWARE_FIRST_FLAG, &rcfw->flags)) {
+	cmdq_prod = hwq->prod;
+	if (test_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags)) {
 		/* The very first doorbell write
 		 * is required to set this flag
 		 * which prompts the FW to reset
 		 * its internal pointers
 		 */
 		cmdq_prod |= BIT(FIRMWARE_FIRST_FLAG);
-		clear_bit(FIRMWARE_FIRST_FLAG, &rcfw->flags);
+		clear_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags);
 	}
 
 	/* ring CMDQ DB */
 	wmb();
-	writel(cmdq_prod, rcfw->cmdq_bar_reg_iomem +
-	       rcfw->cmdq_bar_reg_prod_off);
-	writel(RCFW_CMDQ_TRIG_VAL, rcfw->cmdq_bar_reg_iomem +
-	       rcfw->cmdq_bar_reg_trig_off);
+	writel(cmdq_prod, cmdq->cmdq_mbox.prod);
+	writel(RCFW_CMDQ_TRIG_VAL, cmdq->cmdq_mbox.db);
 done:
-	spin_unlock_irqrestore(&cmdq->lock, flags);
+	spin_unlock_irqrestore(&hwq->lock, flags);
 	/* Return the CREQ response pointer */
 	return 0;
 }
@@ -236,7 +237,7 @@
 		/* timed out */
 		dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n",
 			cookie, opcode, RCFW_CMD_WAIT_TIME_MS);
-		set_bit(FIRMWARE_TIMED_OUT, &rcfw->flags);
+		set_bit(FIRMWARE_TIMED_OUT, &rcfw->cmdq.flags);
 		return rc;
 	}
 
@@ -253,6 +254,8 @@
 static int bnxt_qplib_process_func_event(struct bnxt_qplib_rcfw *rcfw,
 					 struct creq_func_event *func_event)
 {
+	int rc;
+
 	switch (func_event->event) {
 	case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR:
 		break;
@@ -286,37 +289,42 @@
 	default:
 		return -EINVAL;
 	}
-	return 0;
+
+	rc = rcfw->creq.aeq_handler(rcfw, (void *)func_event, NULL);
+	return rc;
 }
 
 static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
 				       struct creq_qp_event *qp_event)
 {
-	struct bnxt_qplib_hwq *cmdq = &rcfw->cmdq;
 	struct creq_qp_error_notification *err_event;
-	struct bnxt_qplib_crsq *crsqe;
-	unsigned long flags;
+	struct bnxt_qplib_hwq *hwq = &rcfw->cmdq.hwq;
+	struct bnxt_qplib_crsqe *crsqe;
 	struct bnxt_qplib_qp *qp;
 	u16 cbit, blocked = 0;
-	u16 cookie;
+	struct pci_dev *pdev;
+	unsigned long flags;
 	__le16  mcookie;
-	u32 qp_id;
+	u16 cookie;
+	int rc = 0;
+	u32 qp_id, tbl_indx;
 
+	pdev = rcfw->pdev;
 	switch (qp_event->event) {
 	case CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION:
 		err_event = (struct creq_qp_error_notification *)qp_event;
 		qp_id = le32_to_cpu(err_event->xid);
-		qp = rcfw->qp_tbl[qp_id].qp_handle;
-		dev_dbg(&rcfw->pdev->dev,
-			"Received QP error notification\n");
-		dev_dbg(&rcfw->pdev->dev,
+		tbl_indx = map_qp_id_to_tbl_indx(qp_id, rcfw);
+		qp = rcfw->qp_tbl[tbl_indx].qp_handle;
+		dev_dbg(&pdev->dev, "Received QP error notification\n");
+		dev_dbg(&pdev->dev,
 			"qpid 0x%x, req_err=0x%x, resp_err=0x%x\n",
 			qp_id, err_event->req_err_state_reason,
 			err_event->res_err_state_reason);
 		if (!qp)
 			break;
 		bnxt_qplib_mark_qp_error(qp);
-		rcfw->aeq_handler(rcfw, qp_event, qp);
+		rc = rcfw->creq.aeq_handler(rcfw, qp_event, qp);
 		break;
 	default:
 		/*
@@ -328,7 +336,7 @@
 		 *
 		 */
 
-		spin_lock_irqsave_nested(&cmdq->lock, flags,
+		spin_lock_irqsave_nested(&hwq->lock, flags,
 					 SINGLE_DEPTH_NESTING);
 		cookie = le16_to_cpu(qp_event->cookie);
 		mcookie = qp_event->cookie;
@@ -342,44 +350,43 @@
 			crsqe->resp = NULL;
 		} else {
 			if (crsqe->resp && crsqe->resp->cookie)
-				dev_err(&rcfw->pdev->dev,
+				dev_err(&pdev->dev,
 					"CMD %s cookie sent=%#x, recd=%#x\n",
 					crsqe->resp ? "mismatch" : "collision",
 					crsqe->resp ? crsqe->resp->cookie : 0,
 					mcookie);
 		}
-		if (!test_and_clear_bit(cbit, rcfw->cmdq_bitmap))
-			dev_warn(&rcfw->pdev->dev,
+		if (!test_and_clear_bit(cbit, rcfw->cmdq.cmdq_bitmap))
+			dev_warn(&pdev->dev,
 				 "CMD bit %d was not requested\n", cbit);
-		cmdq->cons += crsqe->req_size;
+		hwq->cons += crsqe->req_size;
 		crsqe->req_size = 0;
 
 		if (!blocked)
-			wake_up(&rcfw->waitq);
-		spin_unlock_irqrestore(&cmdq->lock, flags);
+			wake_up(&rcfw->cmdq.waitq);
+		spin_unlock_irqrestore(&hwq->lock, flags);
 	}
-	return 0;
+	return rc;
 }
 
 /* SP - CREQ Completion handlers */
-static void bnxt_qplib_service_creq(unsigned long data)
+static void bnxt_qplib_service_creq(struct tasklet_struct *t)
 {
-	struct bnxt_qplib_rcfw *rcfw = (struct bnxt_qplib_rcfw *)data;
-	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
-	struct bnxt_qplib_hwq *creq = &rcfw->creq;
+	struct bnxt_qplib_rcfw *rcfw = from_tasklet(rcfw, t, creq.creq_tasklet);
+	struct bnxt_qplib_creq_ctx *creq = &rcfw->creq;
 	u32 type, budget = CREQ_ENTRY_POLL_BUDGET;
-	struct creq_base *creqe, **creq_ptr;
+	struct bnxt_qplib_hwq *hwq = &creq->hwq;
+	struct creq_base *creqe;
 	u32 sw_cons, raw_cons;
 	unsigned long flags;
 
 	/* Service the CREQ until budget is over */
-	spin_lock_irqsave(&creq->lock, flags);
-	raw_cons = creq->cons;
+	spin_lock_irqsave(&hwq->lock, flags);
+	raw_cons = hwq->cons;
 	while (budget > 0) {
-		sw_cons = HWQ_CMP(raw_cons, creq);
-		creq_ptr = (struct creq_base **)creq->pbl_ptr;
-		creqe = &creq_ptr[get_creq_pg(sw_cons)][get_creq_idx(sw_cons)];
-		if (!CREQ_CMP_VALID(creqe, raw_cons, creq->max_elements))
+		sw_cons = HWQ_CMP(raw_cons, hwq);
+		creqe = bnxt_qplib_get_qe(hwq, sw_cons, NULL);
+		if (!CREQ_CMP_VALID(creqe, raw_cons, hwq->max_elements))
 			break;
 		/* The valid test of the entry must be done first before
 		 * reading any further.
@@ -391,12 +398,12 @@
 		case CREQ_BASE_TYPE_QP_EVENT:
 			bnxt_qplib_process_qp_event
 				(rcfw, (struct creq_qp_event *)creqe);
-			rcfw->creq_qp_event_processed++;
+			creq->stats.creq_qp_event_processed++;
 			break;
 		case CREQ_BASE_TYPE_FUNC_EVENT:
 			if (!bnxt_qplib_process_func_event
 			    (rcfw, (struct creq_func_event *)creqe))
-				rcfw->creq_func_event_processed++;
+				creq->stats.creq_func_event_processed++;
 			else
 				dev_warn(&rcfw->pdev->dev,
 					 "aeqe:%#x Not handled\n", type);
@@ -412,28 +419,28 @@
 		budget--;
 	}
 
-	if (creq->cons != raw_cons) {
-		creq->cons = raw_cons;
-		bnxt_qplib_ring_creq_db_rearm(rcfw->creq_bar_reg_iomem,
-					      raw_cons, creq->max_elements,
-					      rcfw->creq_ring_id, gen_p5);
+	if (hwq->cons != raw_cons) {
+		hwq->cons = raw_cons;
+		bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo,
+				      rcfw->res->cctx, true);
 	}
-	spin_unlock_irqrestore(&creq->lock, flags);
+	spin_unlock_irqrestore(&hwq->lock, flags);
 }
 
 static irqreturn_t bnxt_qplib_creq_irq(int irq, void *dev_instance)
 {
 	struct bnxt_qplib_rcfw *rcfw = dev_instance;
-	struct bnxt_qplib_hwq *creq = &rcfw->creq;
-	struct creq_base **creq_ptr;
+	struct bnxt_qplib_creq_ctx *creq;
+	struct bnxt_qplib_hwq *hwq;
 	u32 sw_cons;
 
+	creq = &rcfw->creq;
+	hwq = &creq->hwq;
 	/* Prefetch the CREQ element */
-	sw_cons = HWQ_CMP(creq->cons, creq);
-	creq_ptr = (struct creq_base **)rcfw->creq.pbl_ptr;
-	prefetch(&creq_ptr[get_creq_pg(sw_cons)][get_creq_idx(sw_cons)]);
+	sw_cons = HWQ_CMP(hwq->cons, hwq);
+	prefetch(bnxt_qplib_get_qe(hwq, sw_cons, NULL));
 
-	tasklet_schedule(&rcfw->worker);
+	tasklet_schedule(&creq->creq_tasklet);
 
 	return IRQ_HANDLED;
 }
@@ -452,33 +459,17 @@
 	if (rc)
 		return rc;
 
-	clear_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags);
+	clear_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags);
 	return 0;
 }
 
-static int __get_pbl_pg_idx(struct bnxt_qplib_pbl *pbl)
-{
-	return (pbl->pg_size == ROCE_PG_SIZE_4K ?
-				      CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_4K :
-		pbl->pg_size == ROCE_PG_SIZE_8K ?
-				      CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_8K :
-		pbl->pg_size == ROCE_PG_SIZE_64K ?
-				      CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_64K :
-		pbl->pg_size == ROCE_PG_SIZE_2M ?
-				      CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_2M :
-		pbl->pg_size == ROCE_PG_SIZE_8M ?
-				      CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_8M :
-		pbl->pg_size == ROCE_PG_SIZE_1G ?
-				      CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_1G :
-				      CMDQ_INITIALIZE_FW_QPC_PG_SIZE_PG_4K);
-}
-
 int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw,
 			 struct bnxt_qplib_ctx *ctx, int is_virtfn)
 {
-	struct cmdq_initialize_fw req;
 	struct creq_initialize_fw_resp resp;
-	u16 cmd_flags = 0, level;
+	struct cmdq_initialize_fw req;
+	u16 cmd_flags = 0;
+	u8 pgsz, lvl;
 	int rc;
 
 	RCFW_CMD_PREP(req, INITIALIZE_FW, cmd_flags);
@@ -494,34 +485,35 @@
 	 * shall setup this area for VF. Skipping the
 	 * HW programming
 	 */
-	if (is_virtfn || bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx))
+	if (is_virtfn)
 		goto skip_ctx_setup;
+	if (bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx))
+		goto config_vf_res;
 
-	level = ctx->qpc_tbl.level;
-	req.qpc_pg_size_qpc_lvl = (level << CMDQ_INITIALIZE_FW_QPC_LVL_SFT) |
-				__get_pbl_pg_idx(&ctx->qpc_tbl.pbl[level]);
-	level = ctx->mrw_tbl.level;
-	req.mrw_pg_size_mrw_lvl = (level << CMDQ_INITIALIZE_FW_MRW_LVL_SFT) |
-				__get_pbl_pg_idx(&ctx->mrw_tbl.pbl[level]);
-	level = ctx->srqc_tbl.level;
-	req.srq_pg_size_srq_lvl = (level << CMDQ_INITIALIZE_FW_SRQ_LVL_SFT) |
-				__get_pbl_pg_idx(&ctx->srqc_tbl.pbl[level]);
-	level = ctx->cq_tbl.level;
-	req.cq_pg_size_cq_lvl = (level << CMDQ_INITIALIZE_FW_CQ_LVL_SFT) |
-				__get_pbl_pg_idx(&ctx->cq_tbl.pbl[level]);
-	level = ctx->srqc_tbl.level;
-	req.srq_pg_size_srq_lvl = (level << CMDQ_INITIALIZE_FW_SRQ_LVL_SFT) |
-				__get_pbl_pg_idx(&ctx->srqc_tbl.pbl[level]);
-	level = ctx->cq_tbl.level;
-	req.cq_pg_size_cq_lvl = (level << CMDQ_INITIALIZE_FW_CQ_LVL_SFT) |
-				__get_pbl_pg_idx(&ctx->cq_tbl.pbl[level]);
-	level = ctx->tim_tbl.level;
-	req.tim_pg_size_tim_lvl = (level << CMDQ_INITIALIZE_FW_TIM_LVL_SFT) |
-				  __get_pbl_pg_idx(&ctx->tim_tbl.pbl[level]);
-	level = ctx->tqm_pde_level;
-	req.tqm_pg_size_tqm_lvl = (level << CMDQ_INITIALIZE_FW_TQM_LVL_SFT) |
-				  __get_pbl_pg_idx(&ctx->tqm_pde.pbl[level]);
-
+	lvl = ctx->qpc_tbl.level;
+	pgsz = bnxt_qplib_base_pg_size(&ctx->qpc_tbl);
+	req.qpc_pg_size_qpc_lvl = (pgsz << CMDQ_INITIALIZE_FW_QPC_PG_SIZE_SFT) |
+				   lvl;
+	lvl = ctx->mrw_tbl.level;
+	pgsz = bnxt_qplib_base_pg_size(&ctx->mrw_tbl);
+	req.mrw_pg_size_mrw_lvl = (pgsz << CMDQ_INITIALIZE_FW_QPC_PG_SIZE_SFT) |
+				   lvl;
+	lvl = ctx->srqc_tbl.level;
+	pgsz = bnxt_qplib_base_pg_size(&ctx->srqc_tbl);
+	req.srq_pg_size_srq_lvl = (pgsz << CMDQ_INITIALIZE_FW_QPC_PG_SIZE_SFT) |
+				   lvl;
+	lvl = ctx->cq_tbl.level;
+	pgsz = bnxt_qplib_base_pg_size(&ctx->cq_tbl);
+	req.cq_pg_size_cq_lvl = (pgsz << CMDQ_INITIALIZE_FW_QPC_PG_SIZE_SFT) |
+				 lvl;
+	lvl = ctx->tim_tbl.level;
+	pgsz = bnxt_qplib_base_pg_size(&ctx->tim_tbl);
+	req.tim_pg_size_tim_lvl = (pgsz << CMDQ_INITIALIZE_FW_QPC_PG_SIZE_SFT) |
+				   lvl;
+	lvl = ctx->tqm_ctx.pde.level;
+	pgsz = bnxt_qplib_base_pg_size(&ctx->tqm_ctx.pde);
+	req.tqm_pg_size_tqm_lvl = (pgsz << CMDQ_INITIALIZE_FW_QPC_PG_SIZE_SFT) |
+				   lvl;
 	req.qpc_page_dir =
 		cpu_to_le64(ctx->qpc_tbl.pbl[PBL_LVL_0].pg_map_arr[0]);
 	req.mrw_page_dir =
@@ -533,13 +525,14 @@
 	req.tim_page_dir =
 		cpu_to_le64(ctx->tim_tbl.pbl[PBL_LVL_0].pg_map_arr[0]);
 	req.tqm_page_dir =
-		cpu_to_le64(ctx->tqm_pde.pbl[PBL_LVL_0].pg_map_arr[0]);
+		cpu_to_le64(ctx->tqm_ctx.pde.pbl[PBL_LVL_0].pg_map_arr[0]);
 
 	req.number_of_qp = cpu_to_le32(ctx->qpc_tbl.max_elements);
 	req.number_of_mrw = cpu_to_le32(ctx->mrw_tbl.max_elements);
 	req.number_of_srq = cpu_to_le32(ctx->srqc_tbl.max_elements);
 	req.number_of_cq = cpu_to_le32(ctx->cq_tbl.max_elements);
 
+config_vf_res:
 	req.max_qp_per_vf = cpu_to_le32(ctx->vf_res.max_qp_per_vf);
 	req.max_mrw_per_vf = cpu_to_le32(ctx->vf_res.max_mrw_per_vf);
 	req.max_srq_per_vf = cpu_to_le32(ctx->vf_res.max_srq_per_vf);
@@ -552,33 +545,46 @@
 					  NULL, 0);
 	if (rc)
 		return rc;
-	set_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags);
+	set_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->cmdq.flags);
 	return 0;
 }
 
 void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
 {
+	kfree(rcfw->cmdq.cmdq_bitmap);
 	kfree(rcfw->qp_tbl);
 	kfree(rcfw->crsqe_tbl);
-	bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->cmdq);
-	bnxt_qplib_free_hwq(rcfw->pdev, &rcfw->creq);
+	bnxt_qplib_free_hwq(rcfw->res, &rcfw->cmdq.hwq);
+	bnxt_qplib_free_hwq(rcfw->res, &rcfw->creq.hwq);
 	rcfw->pdev = NULL;
 }
 
-int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
+int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res,
 				  struct bnxt_qplib_rcfw *rcfw,
 				  struct bnxt_qplib_ctx *ctx,
 				  int qp_tbl_sz)
 {
-	u8 hwq_type;
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
+	struct bnxt_qplib_sg_info sginfo = {};
+	struct bnxt_qplib_cmdq_ctx *cmdq;
+	struct bnxt_qplib_creq_ctx *creq;
+	u32 bmap_size = 0;
 
-	rcfw->pdev = pdev;
-	rcfw->creq.max_elements = BNXT_QPLIB_CREQE_MAX_CNT;
-	hwq_type = bnxt_qplib_get_hwq_type(rcfw->res);
-	if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->creq, NULL,
-				      &rcfw->creq.max_elements,
-				      BNXT_QPLIB_CREQE_UNITS,
-				      0, PAGE_SIZE, hwq_type)) {
+	rcfw->pdev = res->pdev;
+	cmdq = &rcfw->cmdq;
+	creq = &rcfw->creq;
+	rcfw->res = res;
+
+	sginfo.pgsize = PAGE_SIZE;
+	sginfo.pgshft = PAGE_SHIFT;
+
+	hwq_attr.sginfo = &sginfo;
+	hwq_attr.res = rcfw->res;
+	hwq_attr.depth = BNXT_QPLIB_CREQE_MAX_CNT;
+	hwq_attr.stride = BNXT_QPLIB_CREQE_UNITS;
+	hwq_attr.type = bnxt_qplib_get_hwq_type(res);
+
+	if (bnxt_qplib_alloc_init_hwq(&creq->hwq, &hwq_attr)) {
 		dev_err(&rcfw->pdev->dev,
 			"HW channel CREQ allocation failed\n");
 		goto fail;
@@ -588,25 +594,29 @@
 	else
 		rcfw->cmdq_depth = BNXT_QPLIB_CMDQE_MAX_CNT_8192;
 
-	rcfw->cmdq.max_elements = rcfw->cmdq_depth;
-	if (bnxt_qplib_alloc_init_hwq
-			(rcfw->pdev, &rcfw->cmdq, NULL,
-			 &rcfw->cmdq.max_elements,
-			 BNXT_QPLIB_CMDQE_UNITS, 0,
-			 bnxt_qplib_cmdqe_page_size(rcfw->cmdq_depth),
-			 HWQ_TYPE_CTX)) {
+	sginfo.pgsize = bnxt_qplib_cmdqe_page_size(rcfw->cmdq_depth);
+	hwq_attr.depth = rcfw->cmdq_depth;
+	hwq_attr.stride = BNXT_QPLIB_CMDQE_UNITS;
+	hwq_attr.type = HWQ_TYPE_CTX;
+	if (bnxt_qplib_alloc_init_hwq(&cmdq->hwq, &hwq_attr)) {
 		dev_err(&rcfw->pdev->dev,
 			"HW channel CMDQ allocation failed\n");
 		goto fail;
 	}
 
-	rcfw->crsqe_tbl = kcalloc(rcfw->cmdq.max_elements,
+	rcfw->crsqe_tbl = kcalloc(cmdq->hwq.max_elements,
 				  sizeof(*rcfw->crsqe_tbl), GFP_KERNEL);
 	if (!rcfw->crsqe_tbl)
 		goto fail;
 
-	rcfw->qp_tbl_size = qp_tbl_sz;
-	rcfw->qp_tbl = kcalloc(qp_tbl_sz, sizeof(struct bnxt_qplib_qp_node),
+	bmap_size = BITS_TO_LONGS(rcfw->cmdq_depth) * sizeof(unsigned long);
+	cmdq->cmdq_bitmap = kzalloc(bmap_size, GFP_KERNEL);
+	if (!cmdq->cmdq_bitmap)
+		goto fail;
+
+	/* Allocate one extra to hold the QP1 entries */
+	rcfw->qp_tbl_size = qp_tbl_sz + 1;
+	rcfw->qp_tbl = kcalloc(rcfw->qp_tbl_size, sizeof(struct bnxt_qplib_qp_node),
 			       GFP_KERNEL);
 	if (!rcfw->qp_tbl)
 		goto fail;
@@ -620,137 +630,198 @@
 
 void bnxt_qplib_rcfw_stop_irq(struct bnxt_qplib_rcfw *rcfw, bool kill)
 {
-	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
+	struct bnxt_qplib_creq_ctx *creq;
 
-	tasklet_disable(&rcfw->worker);
+	creq = &rcfw->creq;
+	tasklet_disable(&creq->creq_tasklet);
 	/* Mask h/w interrupts */
-	bnxt_qplib_ring_creq_db(rcfw->creq_bar_reg_iomem, rcfw->creq.cons,
-				rcfw->creq.max_elements, rcfw->creq_ring_id,
-				gen_p5);
+	bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, false);
 	/* Sync with last running IRQ-handler */
-	synchronize_irq(rcfw->vector);
+	synchronize_irq(creq->msix_vec);
 	if (kill)
-		tasklet_kill(&rcfw->worker);
+		tasklet_kill(&creq->creq_tasklet);
 
-	if (rcfw->requested) {
-		free_irq(rcfw->vector, rcfw);
-		rcfw->requested = false;
+	if (creq->requested) {
+		free_irq(creq->msix_vec, rcfw);
+		creq->requested = false;
 	}
 }
 
 void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
 {
+	struct bnxt_qplib_creq_ctx *creq;
+	struct bnxt_qplib_cmdq_ctx *cmdq;
 	unsigned long indx;
 
+	creq = &rcfw->creq;
+	cmdq = &rcfw->cmdq;
+	/* Make sure the HW channel is stopped! */
 	bnxt_qplib_rcfw_stop_irq(rcfw, true);
 
-	iounmap(rcfw->cmdq_bar_reg_iomem);
-	iounmap(rcfw->creq_bar_reg_iomem);
+	iounmap(cmdq->cmdq_mbox.reg.bar_reg);
+	iounmap(creq->creq_db.reg.bar_reg);
 
-	indx = find_first_bit(rcfw->cmdq_bitmap, rcfw->bmap_size);
-	if (indx != rcfw->bmap_size)
+	indx = find_first_bit(cmdq->cmdq_bitmap, rcfw->cmdq_depth);
+	if (indx != rcfw->cmdq_depth)
 		dev_err(&rcfw->pdev->dev,
 			"disabling RCFW with pending cmd-bit %lx\n", indx);
-	kfree(rcfw->cmdq_bitmap);
-	rcfw->bmap_size = 0;
 
-	rcfw->cmdq_bar_reg_iomem = NULL;
-	rcfw->creq_bar_reg_iomem = NULL;
-	rcfw->aeq_handler = NULL;
-	rcfw->vector = 0;
+	cmdq->cmdq_mbox.reg.bar_reg = NULL;
+	creq->creq_db.reg.bar_reg = NULL;
+	creq->aeq_handler = NULL;
+	creq->msix_vec = 0;
 }
 
 int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
 			      bool need_init)
 {
-	bool gen_p5 = bnxt_qplib_is_chip_gen_p5(rcfw->res->cctx);
+	struct bnxt_qplib_creq_ctx *creq;
 	int rc;
 
-	if (rcfw->requested)
+	creq = &rcfw->creq;
+
+	if (creq->requested)
 		return -EFAULT;
 
-	rcfw->vector = msix_vector;
+	creq->msix_vec = msix_vector;
 	if (need_init)
-		tasklet_init(&rcfw->worker,
-			     bnxt_qplib_service_creq, (unsigned long)rcfw);
+		tasklet_setup(&creq->creq_tasklet, bnxt_qplib_service_creq);
 	else
-		tasklet_enable(&rcfw->worker);
-	rc = request_irq(rcfw->vector, bnxt_qplib_creq_irq, 0,
+		tasklet_enable(&creq->creq_tasklet);
+	rc = request_irq(creq->msix_vec, bnxt_qplib_creq_irq, 0,
 			 "bnxt_qplib_creq", rcfw);
 	if (rc)
 		return rc;
-	rcfw->requested = true;
-	bnxt_qplib_ring_creq_db_rearm(rcfw->creq_bar_reg_iomem,
-				      rcfw->creq.cons, rcfw->creq.max_elements,
-				      rcfw->creq_ring_id, gen_p5);
+	creq->requested = true;
+
+	bnxt_qplib_ring_nq_db(&creq->creq_db.dbinfo, rcfw->res->cctx, true);
 
 	return 0;
 }
 
-int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
-				   struct bnxt_qplib_rcfw *rcfw,
+static int bnxt_qplib_map_cmdq_mbox(struct bnxt_qplib_rcfw *rcfw, bool is_vf)
+{
+	struct bnxt_qplib_cmdq_mbox *mbox;
+	resource_size_t bar_reg;
+	struct pci_dev *pdev;
+	u16 prod_offt;
+	int rc = 0;
+
+	pdev = rcfw->pdev;
+	mbox = &rcfw->cmdq.cmdq_mbox;
+
+	mbox->reg.bar_id = RCFW_COMM_PCI_BAR_REGION;
+	mbox->reg.len = RCFW_COMM_SIZE;
+	mbox->reg.bar_base = pci_resource_start(pdev, mbox->reg.bar_id);
+	if (!mbox->reg.bar_base) {
+		dev_err(&pdev->dev,
+			"QPLIB: CMDQ BAR region %d resc start is 0!\n",
+			mbox->reg.bar_id);
+		return -ENOMEM;
+	}
+
+	bar_reg = mbox->reg.bar_base + RCFW_COMM_BASE_OFFSET;
+	mbox->reg.len = RCFW_COMM_SIZE;
+	mbox->reg.bar_reg = ioremap(bar_reg, mbox->reg.len);
+	if (!mbox->reg.bar_reg) {
+		dev_err(&pdev->dev,
+			"QPLIB: CMDQ BAR region %d mapping failed\n",
+			mbox->reg.bar_id);
+		return -ENOMEM;
+	}
+
+	prod_offt = is_vf ? RCFW_VF_COMM_PROD_OFFSET :
+			    RCFW_PF_COMM_PROD_OFFSET;
+	mbox->prod = (void  __iomem *)(mbox->reg.bar_reg + prod_offt);
+	mbox->db = (void __iomem *)(mbox->reg.bar_reg + RCFW_COMM_TRIG_OFFSET);
+	return rc;
+}
+
+static int bnxt_qplib_map_creq_db(struct bnxt_qplib_rcfw *rcfw, u32 reg_offt)
+{
+	struct bnxt_qplib_creq_db *creq_db;
+	resource_size_t bar_reg;
+	struct pci_dev *pdev;
+
+	pdev = rcfw->pdev;
+	creq_db = &rcfw->creq.creq_db;
+
+	creq_db->reg.bar_id = RCFW_COMM_CONS_PCI_BAR_REGION;
+	creq_db->reg.bar_base = pci_resource_start(pdev, creq_db->reg.bar_id);
+	if (!creq_db->reg.bar_base)
+		dev_err(&pdev->dev,
+			"QPLIB: CREQ BAR region %d resc start is 0!",
+			creq_db->reg.bar_id);
+
+	bar_reg = creq_db->reg.bar_base + reg_offt;
+	/* Unconditionally map 8 bytes to support 57500 series */
+	creq_db->reg.len = 8;
+	creq_db->reg.bar_reg = ioremap(bar_reg, creq_db->reg.len);
+	if (!creq_db->reg.bar_reg) {
+		dev_err(&pdev->dev,
+			"QPLIB: CREQ BAR region %d mapping failed",
+			creq_db->reg.bar_id);
+		return -ENOMEM;
+	}
+	creq_db->dbinfo.db = creq_db->reg.bar_reg;
+	creq_db->dbinfo.hwq = &rcfw->creq.hwq;
+	creq_db->dbinfo.xid = rcfw->creq.ring_id;
+	return 0;
+}
+
+static void bnxt_qplib_start_rcfw(struct bnxt_qplib_rcfw *rcfw)
+{
+	struct bnxt_qplib_cmdq_ctx *cmdq;
+	struct bnxt_qplib_creq_ctx *creq;
+	struct bnxt_qplib_cmdq_mbox *mbox;
+	struct cmdq_init init = {0};
+
+	cmdq = &rcfw->cmdq;
+	creq = &rcfw->creq;
+	mbox = &cmdq->cmdq_mbox;
+
+	init.cmdq_pbl = cpu_to_le64(cmdq->hwq.pbl[PBL_LVL_0].pg_map_arr[0]);
+	init.cmdq_size_cmdq_lvl =
+			cpu_to_le16(((rcfw->cmdq_depth <<
+				      CMDQ_INIT_CMDQ_SIZE_SFT) &
+				    CMDQ_INIT_CMDQ_SIZE_MASK) |
+				    ((cmdq->hwq.level <<
+				      CMDQ_INIT_CMDQ_LVL_SFT) &
+				    CMDQ_INIT_CMDQ_LVL_MASK));
+	init.creq_ring_id = cpu_to_le16(creq->ring_id);
+	/* Write to the Bono mailbox register */
+	__iowrite32_copy(mbox->reg.bar_reg, &init, sizeof(init) / 4);
+}
+
+int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw,
 				   int msix_vector,
 				   int cp_bar_reg_off, int virt_fn,
-				   int (*aeq_handler)(struct bnxt_qplib_rcfw *,
-						      void *, void *))
+				   aeq_handler_t aeq_handler)
 {
-	resource_size_t res_base;
-	struct cmdq_init init;
-	u16 bmap_size;
+	struct bnxt_qplib_cmdq_ctx *cmdq;
+	struct bnxt_qplib_creq_ctx *creq;
 	int rc;
 
-	/* General */
-	rcfw->seq_num = 0;
-	set_bit(FIRMWARE_FIRST_FLAG, &rcfw->flags);
-	bmap_size = BITS_TO_LONGS(rcfw->cmdq_depth) * sizeof(unsigned long);
-	rcfw->cmdq_bitmap = kzalloc(bmap_size, GFP_KERNEL);
-	if (!rcfw->cmdq_bitmap)
-		return -ENOMEM;
-	rcfw->bmap_size = bmap_size;
+	cmdq = &rcfw->cmdq;
+	creq = &rcfw->creq;
 
-	/* CMDQ */
-	rcfw->cmdq_bar_reg = RCFW_COMM_PCI_BAR_REGION;
-	res_base = pci_resource_start(pdev, rcfw->cmdq_bar_reg);
-	if (!res_base)
-		return -ENOMEM;
+	/* Clear to defaults */
 
-	rcfw->cmdq_bar_reg_iomem = ioremap_nocache(res_base +
-					      RCFW_COMM_BASE_OFFSET,
-					      RCFW_COMM_SIZE);
-	if (!rcfw->cmdq_bar_reg_iomem) {
-		dev_err(&rcfw->pdev->dev, "CMDQ BAR region %d mapping failed\n",
-			rcfw->cmdq_bar_reg);
-		return -ENOMEM;
-	}
+	cmdq->seq_num = 0;
+	set_bit(FIRMWARE_FIRST_FLAG, &cmdq->flags);
+	init_waitqueue_head(&cmdq->waitq);
 
-	rcfw->cmdq_bar_reg_prod_off = virt_fn ? RCFW_VF_COMM_PROD_OFFSET :
-					RCFW_PF_COMM_PROD_OFFSET;
+	creq->stats.creq_qp_event_processed = 0;
+	creq->stats.creq_func_event_processed = 0;
+	creq->aeq_handler = aeq_handler;
 
-	rcfw->cmdq_bar_reg_trig_off = RCFW_COMM_TRIG_OFFSET;
+	rc = bnxt_qplib_map_cmdq_mbox(rcfw, virt_fn);
+	if (rc)
+		return rc;
 
-	/* CREQ */
-	rcfw->creq_bar_reg = RCFW_COMM_CONS_PCI_BAR_REGION;
-	res_base = pci_resource_start(pdev, rcfw->creq_bar_reg);
-	if (!res_base)
-		dev_err(&rcfw->pdev->dev,
-			"CREQ BAR region %d resc start is 0!\n",
-			rcfw->creq_bar_reg);
-	/* Unconditionally map 8 bytes to support 57500 series */
-	rcfw->creq_bar_reg_iomem = ioremap_nocache(res_base + cp_bar_reg_off,
-						   8);
-	if (!rcfw->creq_bar_reg_iomem) {
-		dev_err(&rcfw->pdev->dev, "CREQ BAR region %d mapping failed\n",
-			rcfw->creq_bar_reg);
-		iounmap(rcfw->cmdq_bar_reg_iomem);
-		rcfw->cmdq_bar_reg_iomem = NULL;
-		return -ENOMEM;
-	}
-	rcfw->creq_qp_event_processed = 0;
-	rcfw->creq_func_event_processed = 0;
-
-	if (aeq_handler)
-		rcfw->aeq_handler = aeq_handler;
-	init_waitqueue_head(&rcfw->waitq);
+	rc = bnxt_qplib_map_creq_db(rcfw, cp_bar_reg_off);
+	if (rc)
+		return rc;
 
 	rc = bnxt_qplib_rcfw_start_irq(rcfw, msix_vector, true);
 	if (rc) {
@@ -760,16 +831,8 @@
 		return rc;
 	}
 
-	init.cmdq_pbl = cpu_to_le64(rcfw->cmdq.pbl[PBL_LVL_0].pg_map_arr[0]);
-	init.cmdq_size_cmdq_lvl = cpu_to_le16(
-		((rcfw->cmdq_depth << CMDQ_INIT_CMDQ_SIZE_SFT) &
-		 CMDQ_INIT_CMDQ_SIZE_MASK) |
-		((rcfw->cmdq.level << CMDQ_INIT_CMDQ_LVL_SFT) &
-		 CMDQ_INIT_CMDQ_LVL_MASK));
-	init.creq_ring_id = cpu_to_le16(rcfw->creq_ring_id);
+	bnxt_qplib_start_rcfw(rcfw);
 
-	/* Write to the Bono mailbox register */
-	__iowrite32_copy(rcfw->cmdq_bar_reg_iomem, &init, sizeof(init) / 4);
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
index dfeadc1..6953f4e 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h
@@ -87,12 +87,6 @@
 	return (bnxt_qplib_cmdqe_npages(depth) * PAGE_SIZE);
 }
 
-static inline u32 bnxt_qplib_cmdqe_cnt_per_pg(u32 depth)
-{
-	return (bnxt_qplib_cmdqe_page_size(depth) /
-		 BNXT_QPLIB_CMDQE_UNITS);
-}
-
 /* Set the cmd_size to a factor of CMDQE unit */
 static inline void bnxt_qplib_set_cmd_slots(struct cmdq_base *req)
 {
@@ -100,30 +94,12 @@
 			 BNXT_QPLIB_CMDQE_UNITS;
 }
 
-#define MAX_CMDQ_IDX(depth)		((depth) - 1)
-
-static inline u32 bnxt_qplib_max_cmdq_idx_per_pg(u32 depth)
-{
-	return (bnxt_qplib_cmdqe_cnt_per_pg(depth) - 1);
-}
-
 #define RCFW_MAX_COOKIE_VALUE		0x7FFF
 #define RCFW_CMD_IS_BLOCKING		0x8000
 #define RCFW_BLOCKED_CMD_WAIT_COUNT	0x4E20
 
 #define HWRM_VERSION_RCFW_CMDQ_DEPTH_CHECK 0x1000900020011ULL
 
-static inline u32 get_cmdq_pg(u32 val, u32 depth)
-{
-	return (val & ~(bnxt_qplib_max_cmdq_idx_per_pg(depth))) /
-		(bnxt_qplib_cmdqe_cnt_per_pg(depth));
-}
-
-static inline u32 get_cmdq_idx(u32 val, u32 depth)
-{
-	return val & (bnxt_qplib_max_cmdq_idx_per_pg(depth));
-}
-
 /* Crsq buf is 1024-Byte */
 struct bnxt_qplib_crsbe {
 	u8			data[1024];
@@ -133,81 +109,15 @@
 /* Allocate 1 per QP for async error notification for now */
 #define BNXT_QPLIB_CREQE_MAX_CNT	(64 * 1024)
 #define BNXT_QPLIB_CREQE_UNITS		16	/* 16-Bytes per prod unit */
-#define BNXT_QPLIB_CREQE_CNT_PER_PG	(PAGE_SIZE / BNXT_QPLIB_CREQE_UNITS)
-
-#define MAX_CREQ_IDX			(BNXT_QPLIB_CREQE_MAX_CNT - 1)
-#define MAX_CREQ_IDX_PER_PG		(BNXT_QPLIB_CREQE_CNT_PER_PG - 1)
-
-static inline u32 get_creq_pg(u32 val)
-{
-	return (val & ~MAX_CREQ_IDX_PER_PG) / BNXT_QPLIB_CREQE_CNT_PER_PG;
-}
-
-static inline u32 get_creq_idx(u32 val)
-{
-	return val & MAX_CREQ_IDX_PER_PG;
-}
-
-#define BNXT_QPLIB_CREQE_PER_PG	(PAGE_SIZE / sizeof(struct creq_base))
-
 #define CREQ_CMP_VALID(hdr, raw_cons, cp_bit)			\
 	(!!((hdr)->v & CREQ_BASE_V) ==				\
 	   !((raw_cons) & (cp_bit)))
-
-#define CREQ_DB_KEY_CP			(0x2 << CMPL_DOORBELL_KEY_SFT)
-#define CREQ_DB_IDX_VALID		CMPL_DOORBELL_IDX_VALID
-#define CREQ_DB_IRQ_DIS			CMPL_DOORBELL_MASK
-#define CREQ_DB_CP_FLAGS_REARM		(CREQ_DB_KEY_CP |	\
-					 CREQ_DB_IDX_VALID)
-#define CREQ_DB_CP_FLAGS		(CREQ_DB_KEY_CP |	\
-					 CREQ_DB_IDX_VALID |	\
-					 CREQ_DB_IRQ_DIS)
-
-static inline void bnxt_qplib_ring_creq_db64(void __iomem *db, u32 index,
-					     u32 xid, bool arm)
-{
-	u64 val = 0;
-
-	val = xid & DBC_DBC_XID_MASK;
-	val |= DBC_DBC_PATH_ROCE;
-	val |= arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ;
-	val <<= 32;
-	val |= index & DBC_DBC_INDEX_MASK;
-
-	writeq(val, db);
-}
-
-static inline void bnxt_qplib_ring_creq_db_rearm(void __iomem *db, u32 raw_cons,
-						 u32 max_elements, u32 xid,
-						 bool gen_p5)
-{
-	u32 index = raw_cons & (max_elements - 1);
-
-	if (gen_p5)
-		bnxt_qplib_ring_creq_db64(db, index, xid, true);
-	else
-		writel(CREQ_DB_CP_FLAGS_REARM | (index & DBC_DBC32_XID_MASK),
-		       db);
-}
-
-static inline void bnxt_qplib_ring_creq_db(void __iomem *db, u32 raw_cons,
-					   u32 max_elements, u32 xid,
-					   bool gen_p5)
-{
-	u32 index = raw_cons & (max_elements - 1);
-
-	if (gen_p5)
-		bnxt_qplib_ring_creq_db64(db, index, xid, true);
-	else
-		writel(CREQ_DB_CP_FLAGS | (index & DBC_DBC32_XID_MASK),
-		       db);
-}
-
 #define CREQ_ENTRY_POLL_BUDGET		0x100
 
 /* HWQ */
+typedef int (*aeq_handler_t)(struct bnxt_qplib_rcfw *, void *, void *);
 
-struct bnxt_qplib_crsq {
+struct bnxt_qplib_crsqe {
 	struct creq_qp_event	*resp;
 	u32			req_size;
 };
@@ -225,41 +135,52 @@
 
 #define BNXT_QPLIB_OOS_COUNT_MASK 0xFFFFFFFF
 
+#define FIRMWARE_INITIALIZED_FLAG	(0)
+#define FIRMWARE_FIRST_FLAG		(31)
+#define FIRMWARE_TIMED_OUT		(3)
+struct bnxt_qplib_cmdq_mbox {
+	struct bnxt_qplib_reg_desc	reg;
+	void __iomem			*prod;
+	void __iomem			*db;
+};
+
+struct bnxt_qplib_cmdq_ctx {
+	struct bnxt_qplib_hwq		hwq;
+	struct bnxt_qplib_cmdq_mbox	cmdq_mbox;
+	wait_queue_head_t		waitq;
+	unsigned long			flags;
+	unsigned long			*cmdq_bitmap;
+	u32				seq_num;
+};
+
+struct bnxt_qplib_creq_db {
+	struct bnxt_qplib_reg_desc	reg;
+	struct bnxt_qplib_db_info	dbinfo;
+};
+
+struct bnxt_qplib_creq_stat {
+	u64	creq_qp_event_processed;
+	u64	creq_func_event_processed;
+};
+
+struct bnxt_qplib_creq_ctx {
+	struct bnxt_qplib_hwq		hwq;
+	struct bnxt_qplib_creq_db	creq_db;
+	struct bnxt_qplib_creq_stat	stats;
+	struct tasklet_struct		creq_tasklet;
+	aeq_handler_t			aeq_handler;
+	u16				ring_id;
+	int				msix_vec;
+	bool				requested; /* irq handler installed */
+};
+
 /* RCFW Communication Channels */
 struct bnxt_qplib_rcfw {
 	struct pci_dev		*pdev;
 	struct bnxt_qplib_res	*res;
-	int			vector;
-	struct tasklet_struct	worker;
-	bool			requested;
-	unsigned long		*cmdq_bitmap;
-	u32			bmap_size;
-	unsigned long		flags;
-#define FIRMWARE_INITIALIZED_FLAG	0
-#define FIRMWARE_FIRST_FLAG		31
-#define FIRMWARE_TIMED_OUT		3
-	wait_queue_head_t	waitq;
-	int			(*aeq_handler)(struct bnxt_qplib_rcfw *,
-					       void *, void *);
-	u32			seq_num;
-
-	/* Bar region info */
-	void __iomem		*cmdq_bar_reg_iomem;
-	u16			cmdq_bar_reg;
-	u16			cmdq_bar_reg_prod_off;
-	u16			cmdq_bar_reg_trig_off;
-	u16			creq_ring_id;
-	u16			creq_bar_reg;
-	void __iomem		*creq_bar_reg_iomem;
-
-	/* Cmd-Resp and Async Event notification queue */
-	struct bnxt_qplib_hwq	creq;
-	u64			creq_qp_event_processed;
-	u64			creq_func_event_processed;
-
-	/* Actual Cmd and Resp Queues */
-	struct bnxt_qplib_hwq	cmdq;
-	struct bnxt_qplib_crsq	*crsqe_tbl;
+	struct bnxt_qplib_cmdq_ctx	cmdq;
+	struct bnxt_qplib_creq_ctx	creq;
+	struct bnxt_qplib_crsqe		*crsqe_tbl;
 	int qp_tbl_size;
 	struct bnxt_qplib_qp_node *qp_tbl;
 	u64 oos_prev;
@@ -268,7 +189,7 @@
 };
 
 void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
-int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
+int bnxt_qplib_alloc_rcfw_channel(struct bnxt_qplib_res *res,
 				  struct bnxt_qplib_rcfw *rcfw,
 				  struct bnxt_qplib_ctx *ctx,
 				  int qp_tbl_sz);
@@ -276,12 +197,10 @@
 void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw);
 int bnxt_qplib_rcfw_start_irq(struct bnxt_qplib_rcfw *rcfw, int msix_vector,
 			      bool need_init);
-int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
-				   struct bnxt_qplib_rcfw *rcfw,
+int bnxt_qplib_enable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw,
 				   int msix_vector,
 				   int cp_bar_reg_off, int virt_fn,
-				   int (*aeq_handler)(struct bnxt_qplib_rcfw *,
-						      void *aeqe, void *obj));
+				   aeq_handler_t aeq_handler);
 
 struct bnxt_qplib_rcfw_sbuf *bnxt_qplib_rcfw_alloc_sbuf(
 				struct bnxt_qplib_rcfw *rcfw,
@@ -296,4 +215,9 @@
 int bnxt_qplib_init_rcfw(struct bnxt_qplib_rcfw *rcfw,
 			 struct bnxt_qplib_ctx *ctx, int is_virtfn);
 void bnxt_qplib_mark_qp_error(void *qp_handle);
+static inline u32 map_qp_id_to_tbl_indx(u32 qid, struct bnxt_qplib_rcfw *rcfw)
+{
+	/* Last index of the qp_tbl is for QP1 i.e. qp_tbl_size - 1 */
+	return (qid == 1) ? rcfw->qp_tbl_size - 1 : qid % rcfw->qp_tbl_size - 2;
+}
 #endif /* __BNXT_QPLIB_RCFW_H__ */
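As a quick illustration of the new qp_tbl index mapping added above, here is a minimal user-space sketch (the table size is an assumed illustrative value, not taken from the driver):

#include <stdio.h>

/* Mirrors map_qp_id_to_tbl_indx() above; qp_tbl_size is passed in here
 * instead of being read from struct bnxt_qplib_rcfw. */
static unsigned int map_qp_id_to_tbl_indx(unsigned int qid,
					  unsigned int qp_tbl_size)
{
	/* QP1 always occupies the last slot of qp_tbl. */
	return (qid == 1) ? qp_tbl_size - 1 : qid % qp_tbl_size - 2;
}

int main(void)
{
	unsigned int tbl_size = 64;	/* assumed table size */

	printf("qid 1   -> index %u\n", map_qp_id_to_tbl_indx(1, tbl_size));
	printf("qid 5   -> index %u\n", map_qp_id_to_tbl_indx(5, tbl_size));
	printf("qid 130 -> index %u\n", map_qp_id_to_tbl_indx(130, tbl_size));
	return 0;
}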
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c
index 4041b35..754dceb 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c
@@ -44,6 +44,10 @@
 #include <linux/inetdevice.h>
 #include <linux/dma-mapping.h>
 #include <linux/if_vlan.h>
+#include <linux/vmalloc.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+
 #include "roce_hsi.h"
 #include "qplib_res.h"
 #include "qplib_sp.h"
@@ -52,12 +56,14 @@
 static void bnxt_qplib_free_stats_ctx(struct pci_dev *pdev,
 				      struct bnxt_qplib_stats *stats);
 static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev,
+				      struct bnxt_qplib_chip_ctx *cctx,
 				      struct bnxt_qplib_stats *stats);
 
 /* PBL */
-static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
+static void __free_pbl(struct bnxt_qplib_res *res, struct bnxt_qplib_pbl *pbl,
 		       bool is_umem)
 {
+	struct pci_dev *pdev = res->pdev;
 	int i;
 
 	if (!is_umem) {
@@ -74,37 +80,58 @@
 			pbl->pg_arr[i] = NULL;
 		}
 	}
-	kfree(pbl->pg_arr);
+	vfree(pbl->pg_arr);
 	pbl->pg_arr = NULL;
-	kfree(pbl->pg_map_arr);
+	vfree(pbl->pg_map_arr);
 	pbl->pg_map_arr = NULL;
 	pbl->pg_count = 0;
 	pbl->pg_size = 0;
 }
 
-static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
-		       struct scatterlist *sghead, u32 pages,
-		       u32 nmaps, u32 pg_size)
+static void bnxt_qplib_fill_user_dma_pages(struct bnxt_qplib_pbl *pbl,
+					   struct bnxt_qplib_sg_info *sginfo)
 {
-	struct sg_dma_page_iter sg_iter;
+	struct ib_block_iter biter;
+	int i = 0;
+
+	rdma_umem_for_each_dma_block(sginfo->umem, &biter, sginfo->pgsize) {
+		pbl->pg_map_arr[i] = rdma_block_iter_dma_address(&biter);
+		pbl->pg_arr[i] = NULL;
+		pbl->pg_count++;
+		i++;
+	}
+}
+
+static int __alloc_pbl(struct bnxt_qplib_res *res,
+		       struct bnxt_qplib_pbl *pbl,
+		       struct bnxt_qplib_sg_info *sginfo)
+{
+	struct pci_dev *pdev = res->pdev;
 	bool is_umem = false;
+	u32 pages;
 	int i;
 
+	if (sginfo->nopte)
+		return 0;
+	if (sginfo->umem)
+		pages = ib_umem_num_dma_blocks(sginfo->umem, sginfo->pgsize);
+	else
+		pages = sginfo->npages;
 	/* page ptr arrays */
-	pbl->pg_arr = kcalloc(pages, sizeof(void *), GFP_KERNEL);
+	pbl->pg_arr = vmalloc(pages * sizeof(void *));
 	if (!pbl->pg_arr)
 		return -ENOMEM;
 
-	pbl->pg_map_arr = kcalloc(pages, sizeof(dma_addr_t), GFP_KERNEL);
+	pbl->pg_map_arr = vmalloc(pages * sizeof(dma_addr_t));
 	if (!pbl->pg_map_arr) {
-		kfree(pbl->pg_arr);
+		vfree(pbl->pg_arr);
 		pbl->pg_arr = NULL;
 		return -ENOMEM;
 	}
 	pbl->pg_count = 0;
-	pbl->pg_size = pg_size;
+	pbl->pg_size = sginfo->pgsize;
 
-	if (!sghead) {
+	if (!sginfo->umem) {
 		for (i = 0; i < pages; i++) {
 			pbl->pg_arr[i] = dma_alloc_coherent(&pdev->dev,
 							    pbl->pg_size,
@@ -115,25 +142,19 @@
 			pbl->pg_count++;
 		}
 	} else {
-		i = 0;
 		is_umem = true;
-		for_each_sg_dma_page(sghead, &sg_iter, nmaps, 0) {
-			pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter);
-			pbl->pg_arr[i] = NULL;
-			pbl->pg_count++;
-			i++;
-		}
+		bnxt_qplib_fill_user_dma_pages(pbl, sginfo);
 	}
 
 	return 0;
-
 fail:
-	__free_pbl(pdev, pbl, is_umem);
+	__free_pbl(res, pbl, is_umem);
 	return -ENOMEM;
 }
 
 /* HWQ */
-void bnxt_qplib_free_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq)
+void bnxt_qplib_free_hwq(struct bnxt_qplib_res *res,
+			 struct bnxt_qplib_hwq *hwq)
 {
 	int i;
 
@@ -144,9 +165,9 @@
 
 	for (i = 0; i < hwq->level + 1; i++) {
 		if (i == hwq->level)
-			__free_pbl(pdev, &hwq->pbl[i], hwq->is_user);
+			__free_pbl(res, &hwq->pbl[i], hwq->is_user);
 		else
-			__free_pbl(pdev, &hwq->pbl[i], false);
+			__free_pbl(res, &hwq->pbl[i], false);
 	}
 
 	hwq->level = PBL_LVL_MAX;
@@ -158,79 +179,114 @@
 }
 
 /* All HWQs are power of 2 in size */
-int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
-			      struct bnxt_qplib_sg_info *sg_info,
-			      u32 *elements, u32 element_size, u32 aux,
-			      u32 pg_size, enum bnxt_qplib_hwq_type hwq_type)
-{
-	u32 pages, maps, slots, size, aux_pages = 0, aux_size = 0;
-	dma_addr_t *src_phys_ptr, **dst_virt_ptr;
-	struct scatterlist *sghead = NULL;
-	int i, rc;
 
+int bnxt_qplib_alloc_init_hwq(struct bnxt_qplib_hwq *hwq,
+			      struct bnxt_qplib_hwq_attr *hwq_attr)
+{
+	u32 npages, aux_slots, pg_size, aux_pages = 0, aux_size = 0;
+	struct bnxt_qplib_sg_info sginfo = {};
+	u32 depth, stride, npbl, npde;
+	dma_addr_t *src_phys_ptr, **dst_virt_ptr;
+	struct bnxt_qplib_res *res;
+	struct pci_dev *pdev;
+	int i, rc, lvl;
+
+	res = hwq_attr->res;
+	pdev = res->pdev;
+	pg_size = hwq_attr->sginfo->pgsize;
 	hwq->level = PBL_LVL_MAX;
 
-	slots = roundup_pow_of_two(*elements);
-	if (aux) {
-		aux_size = roundup_pow_of_two(aux);
-		aux_pages = (slots * aux_size) / pg_size;
-		if ((slots * aux_size) % pg_size)
+	depth = roundup_pow_of_two(hwq_attr->depth);
+	stride = roundup_pow_of_two(hwq_attr->stride);
+	if (hwq_attr->aux_depth) {
+		aux_slots = hwq_attr->aux_depth;
+		aux_size = roundup_pow_of_two(hwq_attr->aux_stride);
+		aux_pages = (aux_slots * aux_size) / pg_size;
+		if ((aux_slots * aux_size) % pg_size)
 			aux_pages++;
 	}
-	size = roundup_pow_of_two(element_size);
 
-	if (sg_info)
-		sghead = sg_info->sglist;
-
-	if (!sghead) {
+	if (!hwq_attr->sginfo->umem) {
 		hwq->is_user = false;
-		pages = (slots * size) / pg_size + aux_pages;
-		if ((slots * size) % pg_size)
-			pages++;
-		if (!pages)
+		npages = (depth * stride) / pg_size + aux_pages;
+		if ((depth * stride) % pg_size)
+			npages++;
+		if (!npages)
 			return -EINVAL;
-		maps = 0;
+		hwq_attr->sginfo->npages = npages;
 	} else {
+		unsigned long sginfo_num_pages = ib_umem_num_dma_blocks(
+			hwq_attr->sginfo->umem, hwq_attr->sginfo->pgsize);
+
 		hwq->is_user = true;
-		pages = sg_info->npages;
-		maps = sg_info->nmap;
+		npages = sginfo_num_pages;
+		npages = (npages * PAGE_SIZE) /
+			  BIT_ULL(hwq_attr->sginfo->pgshft);
+		if ((sginfo_num_pages * PAGE_SIZE) %
+		     BIT_ULL(hwq_attr->sginfo->pgshft))
+			if (!npages)
+				npages++;
 	}
 
-	/* Alloc the 1st memory block; can be a PDL/PTL/PBL */
-	if (sghead && (pages == MAX_PBL_LVL_0_PGS))
-		rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_0], sghead,
-				 pages, maps, pg_size);
-	else
-		rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_0], NULL,
-				 1, 0, pg_size);
-	if (rc)
-		goto fail;
+	if (npages == MAX_PBL_LVL_0_PGS) {
+		/* This request is Level 0, map PTE */
+		rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_0], hwq_attr->sginfo);
+		if (rc)
+			goto fail;
+		hwq->level = PBL_LVL_0;
+	}
 
-	hwq->level = PBL_LVL_0;
-
-	if (pages > MAX_PBL_LVL_0_PGS) {
-		if (pages > MAX_PBL_LVL_1_PGS) {
+	if (npages > MAX_PBL_LVL_0_PGS) {
+		if (npages > MAX_PBL_LVL_1_PGS) {
+			u32 flag = (hwq_attr->type == HWQ_TYPE_L2_CMPL) ?
+				    0 : PTU_PTE_VALID;
 			/* 2 levels of indirection */
-			rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_1], NULL,
-					 MAX_PBL_LVL_1_PGS_FOR_LVL_2,
-					 0, pg_size);
+			npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT;
+			if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT))
+				npbl++;
+			npde = npbl >> MAX_PDL_LVL_SHIFT;
+			if (npbl % BIT(MAX_PDL_LVL_SHIFT))
+				npde++;
+			/* Alloc PDE pages */
+			sginfo.pgsize = npde * pg_size;
+			sginfo.npages = 1;
+			rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_0], &sginfo);
+
+			/* Alloc PBL pages */
+			sginfo.npages = npbl;
+			sginfo.pgsize = PAGE_SIZE;
+			rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_1], &sginfo);
 			if (rc)
 				goto fail;
-			/* Fill in lvl0 PBL */
+			/* Fill PDL with PBL page pointers */
 			dst_virt_ptr =
 				(dma_addr_t **)hwq->pbl[PBL_LVL_0].pg_arr;
 			src_phys_ptr = hwq->pbl[PBL_LVL_1].pg_map_arr;
-			for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; i++)
-				dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] =
-					src_phys_ptr[i] | PTU_PDE_VALID;
-			hwq->level = PBL_LVL_1;
-
-			rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_2], sghead,
-					 pages, maps, pg_size);
+			if (hwq_attr->type == HWQ_TYPE_MR) {
+			/* For MR it is expected that we supply only 1 contiguous
+			 * page i.e. only 1 entry in the PDL that will contain
+			 * all the PBLs for the user supplied memory region
+			 */
+				for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count;
+				     i++)
+					dst_virt_ptr[0][i] = src_phys_ptr[i] |
+						flag;
+			} else {
+				for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count;
+				     i++)
+					dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] =
+						src_phys_ptr[i] |
+						PTU_PDE_VALID;
+			}
+			/* Alloc or init PTEs */
+			rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_2],
+					 hwq_attr->sginfo);
 			if (rc)
 				goto fail;
-
-			/* Fill in lvl1 PBL */
+			hwq->level = PBL_LVL_2;
+			if (hwq_attr->sginfo->nopte)
+				goto done;
+			/* Fill PBLs with PTE pointers */
 			dst_virt_ptr =
 				(dma_addr_t **)hwq->pbl[PBL_LVL_1].pg_arr;
 			src_phys_ptr = hwq->pbl[PBL_LVL_2].pg_map_arr;
@@ -238,7 +294,7 @@
 				dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] =
 					src_phys_ptr[i] | PTU_PTE_VALID;
 			}
-			if (hwq_type == HWQ_TYPE_QUEUE) {
+			if (hwq_attr->type == HWQ_TYPE_QUEUE) {
 				/* Find the last pg of the size */
 				i = hwq->pbl[PBL_LVL_2].pg_count;
 				dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |=
@@ -248,25 +304,36 @@
 						    [PTR_IDX(i - 2)] |=
 						    PTU_PTE_NEXT_TO_LAST;
 			}
-			hwq->level = PBL_LVL_2;
-		} else {
-			u32 flag = hwq_type == HWQ_TYPE_L2_CMPL ? 0 :
-						PTU_PTE_VALID;
+		} else { /* pages < 512 npbl = 1, npde = 0 */
+			u32 flag = (hwq_attr->type == HWQ_TYPE_L2_CMPL) ?
+				    0 : PTU_PTE_VALID;
 
 			/* 1 level of indirection */
-			rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_1], sghead,
-					 pages, maps, pg_size);
+			npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT;
+			if (npages % BIT(MAX_PBL_LVL_1_PGS_SHIFT))
+				npbl++;
+			sginfo.npages = npbl;
+			sginfo.pgsize = PAGE_SIZE;
+			/* Alloc PBL page */
+			rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_0], &sginfo);
 			if (rc)
 				goto fail;
-			/* Fill in lvl0 PBL */
+			/* Alloc or init PTEs */
+			rc = __alloc_pbl(res, &hwq->pbl[PBL_LVL_1],
+					 hwq_attr->sginfo);
+			if (rc)
+				goto fail;
+			hwq->level = PBL_LVL_1;
+			if (hwq_attr->sginfo->nopte)
+				goto done;
+			/* Fill PBL with PTE pointers */
 			dst_virt_ptr =
 				(dma_addr_t **)hwq->pbl[PBL_LVL_0].pg_arr;
 			src_phys_ptr = hwq->pbl[PBL_LVL_1].pg_map_arr;
-			for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; i++) {
+			for (i = 0; i < hwq->pbl[PBL_LVL_1].pg_count; i++)
 				dst_virt_ptr[PTR_PG(i)][PTR_IDX(i)] =
 					src_phys_ptr[i] | flag;
-			}
-			if (hwq_type == HWQ_TYPE_QUEUE) {
+			if (hwq_attr->type == HWQ_TYPE_QUEUE) {
 				/* Find the last pg of the size */
 				i = hwq->pbl[PBL_LVL_1].pg_count;
 				dst_virt_ptr[PTR_PG(i - 1)][PTR_IDX(i - 1)] |=
@@ -276,42 +343,142 @@
 						    [PTR_IDX(i - 2)] |=
 						    PTU_PTE_NEXT_TO_LAST;
 			}
-			hwq->level = PBL_LVL_1;
 		}
 	}
-	hwq->pdev = pdev;
-	spin_lock_init(&hwq->lock);
+done:
 	hwq->prod = 0;
 	hwq->cons = 0;
-	*elements = hwq->max_elements = slots;
-	hwq->element_size = size;
-
+	hwq->pdev = pdev;
+	hwq->depth = hwq_attr->depth;
+	hwq->max_elements = depth;
+	hwq->element_size = stride;
+	hwq->qe_ppg = pg_size / stride;
 	/* For direct access to the elements */
-	hwq->pbl_ptr = hwq->pbl[hwq->level].pg_arr;
-	hwq->pbl_dma_ptr = hwq->pbl[hwq->level].pg_map_arr;
+	lvl = hwq->level;
+	if (hwq_attr->sginfo->nopte && hwq->level)
+		lvl = hwq->level - 1;
+	hwq->pbl_ptr = hwq->pbl[lvl].pg_arr;
+	hwq->pbl_dma_ptr = hwq->pbl[lvl].pg_map_arr;
+	spin_lock_init(&hwq->lock);
 
 	return 0;
-
 fail:
-	bnxt_qplib_free_hwq(pdev, hwq);
+	bnxt_qplib_free_hwq(res, hwq);
 	return -ENOMEM;
 }
 
 /* Context Tables */
-void bnxt_qplib_free_ctx(struct pci_dev *pdev,
+void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res,
 			 struct bnxt_qplib_ctx *ctx)
 {
 	int i;
 
-	bnxt_qplib_free_hwq(pdev, &ctx->qpc_tbl);
-	bnxt_qplib_free_hwq(pdev, &ctx->mrw_tbl);
-	bnxt_qplib_free_hwq(pdev, &ctx->srqc_tbl);
-	bnxt_qplib_free_hwq(pdev, &ctx->cq_tbl);
-	bnxt_qplib_free_hwq(pdev, &ctx->tim_tbl);
+	bnxt_qplib_free_hwq(res, &ctx->qpc_tbl);
+	bnxt_qplib_free_hwq(res, &ctx->mrw_tbl);
+	bnxt_qplib_free_hwq(res, &ctx->srqc_tbl);
+	bnxt_qplib_free_hwq(res, &ctx->cq_tbl);
+	bnxt_qplib_free_hwq(res, &ctx->tim_tbl);
 	for (i = 0; i < MAX_TQM_ALLOC_REQ; i++)
-		bnxt_qplib_free_hwq(pdev, &ctx->tqm_tbl[i]);
-	bnxt_qplib_free_hwq(pdev, &ctx->tqm_pde);
-	bnxt_qplib_free_stats_ctx(pdev, &ctx->stats);
+		bnxt_qplib_free_hwq(res, &ctx->tqm_ctx.qtbl[i]);
+	/* restore original pde level before destroy */
+	ctx->tqm_ctx.pde.level = ctx->tqm_ctx.pde_level;
+	bnxt_qplib_free_hwq(res, &ctx->tqm_ctx.pde);
+	bnxt_qplib_free_stats_ctx(res->pdev, &ctx->stats);
+}
+
+static int bnxt_qplib_alloc_tqm_rings(struct bnxt_qplib_res *res,
+				      struct bnxt_qplib_ctx *ctx)
+{
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
+	struct bnxt_qplib_sg_info sginfo = {};
+	struct bnxt_qplib_tqm_ctx *tqmctx;
+	int rc = 0;
+	int i;
+
+	tqmctx = &ctx->tqm_ctx;
+
+	sginfo.pgsize = PAGE_SIZE;
+	sginfo.pgshft = PAGE_SHIFT;
+	hwq_attr.sginfo = &sginfo;
+	hwq_attr.res = res;
+	hwq_attr.type = HWQ_TYPE_CTX;
+	hwq_attr.depth = 512;
+	hwq_attr.stride = sizeof(u64);
+	/* Alloc pdl buffer */
+	rc = bnxt_qplib_alloc_init_hwq(&tqmctx->pde, &hwq_attr);
+	if (rc)
+		goto out;
+	/* Save original pdl level */
+	tqmctx->pde_level = tqmctx->pde.level;
+
+	hwq_attr.stride = 1;
+	for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) {
+		if (!tqmctx->qcount[i])
+			continue;
+		hwq_attr.depth = ctx->qpc_count * tqmctx->qcount[i];
+		rc = bnxt_qplib_alloc_init_hwq(&tqmctx->qtbl[i], &hwq_attr);
+		if (rc)
+			goto out;
+	}
+out:
+	return rc;
+}
+
+static void bnxt_qplib_map_tqm_pgtbl(struct bnxt_qplib_tqm_ctx *ctx)
+{
+	struct bnxt_qplib_hwq *tbl;
+	dma_addr_t *dma_ptr;
+	__le64 **pbl_ptr, *ptr;
+	int i, j, k;
+	int fnz_idx = -1;
+	int pg_count;
+
+	pbl_ptr = (__le64 **)ctx->pde.pbl_ptr;
+
+	for (i = 0, j = 0; i < MAX_TQM_ALLOC_REQ;
+	     i++, j += MAX_TQM_ALLOC_BLK_SIZE) {
+		tbl = &ctx->qtbl[i];
+		if (!tbl->max_elements)
+			continue;
+		if (fnz_idx == -1)
+			fnz_idx = i; /* first non-zero index */
+		switch (tbl->level) {
+		case PBL_LVL_2:
+			pg_count = tbl->pbl[PBL_LVL_1].pg_count;
+			for (k = 0; k < pg_count; k++) {
+				ptr = &pbl_ptr[PTR_PG(j + k)][PTR_IDX(j + k)];
+				dma_ptr = &tbl->pbl[PBL_LVL_1].pg_map_arr[k];
+				*ptr = cpu_to_le64(*dma_ptr | PTU_PTE_VALID);
+			}
+			break;
+		case PBL_LVL_1:
+		case PBL_LVL_0:
+		default:
+			ptr = &pbl_ptr[PTR_PG(j)][PTR_IDX(j)];
+			*ptr = cpu_to_le64(tbl->pbl[PBL_LVL_0].pg_map_arr[0] |
+					   PTU_PTE_VALID);
+			break;
+		}
+	}
+	if (fnz_idx == -1)
+		fnz_idx = 0;
+	/* update pde level as per page table programming */
+	ctx->pde.level = (ctx->qtbl[fnz_idx].level == PBL_LVL_2) ? PBL_LVL_2 :
+			  ctx->qtbl[fnz_idx].level + 1;
+}
+
+static int bnxt_qplib_setup_tqm_rings(struct bnxt_qplib_res *res,
+				      struct bnxt_qplib_ctx *ctx)
+{
+	int rc = 0;
+
+	rc = bnxt_qplib_alloc_tqm_rings(res, ctx);
+	if (rc)
+		goto fail;
+
+	bnxt_qplib_map_tqm_pgtbl(&ctx->tqm_ctx);
+fail:
+	return rc;
 }
 
 /*
@@ -335,120 +502,72 @@
  * Returns:
  *     0 if success, else -ERRORS
  */
-int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
+int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res,
 			 struct bnxt_qplib_ctx *ctx,
 			 bool virt_fn, bool is_p5)
 {
-	int i, j, k, rc = 0;
-	int fnz_idx = -1;
-	__le64 **pbl_ptr;
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
+	struct bnxt_qplib_sg_info sginfo = {};
+	int rc = 0;
 
 	if (virt_fn || is_p5)
 		goto stats_alloc;
 
 	/* QPC Tables */
-	ctx->qpc_tbl.max_elements = ctx->qpc_count;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->qpc_tbl, NULL,
-				       &ctx->qpc_tbl.max_elements,
-				       BNXT_QPLIB_MAX_QP_CTX_ENTRY_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_CTX);
+	sginfo.pgsize = PAGE_SIZE;
+	sginfo.pgshft = PAGE_SHIFT;
+	hwq_attr.sginfo = &sginfo;
+
+	hwq_attr.res = res;
+	hwq_attr.depth = ctx->qpc_count;
+	hwq_attr.stride = BNXT_QPLIB_MAX_QP_CTX_ENTRY_SIZE;
+	hwq_attr.type = HWQ_TYPE_CTX;
+	rc = bnxt_qplib_alloc_init_hwq(&ctx->qpc_tbl, &hwq_attr);
 	if (rc)
 		goto fail;
 
 	/* MRW Tables */
-	ctx->mrw_tbl.max_elements = ctx->mrw_count;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->mrw_tbl, NULL,
-				       &ctx->mrw_tbl.max_elements,
-				       BNXT_QPLIB_MAX_MRW_CTX_ENTRY_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_CTX);
+	hwq_attr.depth = ctx->mrw_count;
+	hwq_attr.stride = BNXT_QPLIB_MAX_MRW_CTX_ENTRY_SIZE;
+	rc = bnxt_qplib_alloc_init_hwq(&ctx->mrw_tbl, &hwq_attr);
 	if (rc)
 		goto fail;
 
 	/* SRQ Tables */
-	ctx->srqc_tbl.max_elements = ctx->srqc_count;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->srqc_tbl, NULL,
-				       &ctx->srqc_tbl.max_elements,
-				       BNXT_QPLIB_MAX_SRQ_CTX_ENTRY_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_CTX);
+	hwq_attr.depth = ctx->srqc_count;
+	hwq_attr.stride = BNXT_QPLIB_MAX_SRQ_CTX_ENTRY_SIZE;
+	rc = bnxt_qplib_alloc_init_hwq(&ctx->srqc_tbl, &hwq_attr);
 	if (rc)
 		goto fail;
 
 	/* CQ Tables */
-	ctx->cq_tbl.max_elements = ctx->cq_count;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->cq_tbl, NULL,
-				       &ctx->cq_tbl.max_elements,
-				       BNXT_QPLIB_MAX_CQ_CTX_ENTRY_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_CTX);
+	hwq_attr.depth = ctx->cq_count;
+	hwq_attr.stride = BNXT_QPLIB_MAX_CQ_CTX_ENTRY_SIZE;
+	rc = bnxt_qplib_alloc_init_hwq(&ctx->cq_tbl, &hwq_attr);
 	if (rc)
 		goto fail;
 
 	/* TQM Buffer */
-	ctx->tqm_pde.max_elements = 512;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_pde, NULL,
-				       &ctx->tqm_pde.max_elements, sizeof(u64),
-				       0, PAGE_SIZE, HWQ_TYPE_CTX);
+	rc = bnxt_qplib_setup_tqm_rings(res, ctx);
 	if (rc)
 		goto fail;
-
-	for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) {
-		if (!ctx->tqm_count[i])
-			continue;
-		ctx->tqm_tbl[i].max_elements = ctx->qpc_count *
-					       ctx->tqm_count[i];
-		rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_tbl[i], NULL,
-					       &ctx->tqm_tbl[i].max_elements, 1,
-					       0, PAGE_SIZE, HWQ_TYPE_CTX);
-		if (rc)
-			goto fail;
-	}
-	pbl_ptr = (__le64 **)ctx->tqm_pde.pbl_ptr;
-	for (i = 0, j = 0; i < MAX_TQM_ALLOC_REQ;
-	     i++, j += MAX_TQM_ALLOC_BLK_SIZE) {
-		if (!ctx->tqm_tbl[i].max_elements)
-			continue;
-		if (fnz_idx == -1)
-			fnz_idx = i;
-		switch (ctx->tqm_tbl[i].level) {
-		case PBL_LVL_2:
-			for (k = 0; k < ctx->tqm_tbl[i].pbl[PBL_LVL_1].pg_count;
-			     k++)
-				pbl_ptr[PTR_PG(j + k)][PTR_IDX(j + k)] =
-				  cpu_to_le64(
-				    ctx->tqm_tbl[i].pbl[PBL_LVL_1].pg_map_arr[k]
-				    | PTU_PTE_VALID);
-			break;
-		case PBL_LVL_1:
-		case PBL_LVL_0:
-		default:
-			pbl_ptr[PTR_PG(j)][PTR_IDX(j)] = cpu_to_le64(
-				ctx->tqm_tbl[i].pbl[PBL_LVL_0].pg_map_arr[0] |
-				PTU_PTE_VALID);
-			break;
-		}
-	}
-	if (fnz_idx == -1)
-		fnz_idx = 0;
-	ctx->tqm_pde_level = ctx->tqm_tbl[fnz_idx].level == PBL_LVL_2 ?
-			     PBL_LVL_2 : ctx->tqm_tbl[fnz_idx].level + 1;
-
 	/* TIM Buffer */
 	ctx->tim_tbl.max_elements = ctx->qpc_count * 16;
-	rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tim_tbl, NULL,
-				       &ctx->tim_tbl.max_elements, 1,
-				       0, PAGE_SIZE, HWQ_TYPE_CTX);
+	hwq_attr.depth = ctx->qpc_count * 16;
+	hwq_attr.stride = 1;
+	rc = bnxt_qplib_alloc_init_hwq(&ctx->tim_tbl, &hwq_attr);
 	if (rc)
 		goto fail;
-
 stats_alloc:
 	/* Stats */
-	rc = bnxt_qplib_alloc_stats_ctx(pdev, &ctx->stats);
+	rc = bnxt_qplib_alloc_stats_ctx(res->pdev, res->cctx, &ctx->stats);
 	if (rc)
 		goto fail;
 
 	return 0;
 
 fail:
-	bnxt_qplib_free_ctx(pdev, ctx);
+	bnxt_qplib_free_ctx(res, ctx);
 	return rc;
 }
 
@@ -704,7 +823,7 @@
 		return -ENOMEM;
 	}
 
-	dpit->dbr_bar_reg_iomem = ioremap_nocache(bar_reg_base + dbr_offset,
+	dpit->dbr_bar_reg_iomem = ioremap(bar_reg_base + dbr_offset,
 						  dbr_len);
 	if (!dpit->dbr_bar_reg_iomem) {
 		dev_err(&res->pdev->dev,
@@ -771,15 +890,12 @@
 }
 
 static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev,
+				      struct bnxt_qplib_chip_ctx *cctx,
 				      struct bnxt_qplib_stats *stats)
 {
 	memset(stats, 0, sizeof(*stats));
 	stats->fw_id = -1;
-	/* 128 byte aligned context memory is required only for 57500.
-	 * However making this unconditional, it does not harm previous
-	 * generation.
-	 */
-	stats->size = ALIGN(sizeof(struct ctx_hw_stats), 128);
+	stats->size = cctx->hw_stats_size;
 	stats->dma = dma_alloc_coherent(&pdev->dev, stats->size,
 					&stats->dma_map, GFP_KERNEL);
 	if (!stats->dma) {
@@ -809,9 +925,6 @@
 	bnxt_qplib_free_sgid_tbl(res, &res->sgid_tbl);
 	bnxt_qplib_free_pd_tbl(&res->pd_tbl);
 	bnxt_qplib_free_dpi_tbl(res, &res->dpi_tbl);
-
-	res->netdev = NULL;
-	res->pdev = NULL;
 }
 
 int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev,
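To illustrate the page/PBL/PDE sizing arithmetic used by the reworked bnxt_qplib_alloc_init_hwq() above, a minimal user-space sketch follows; the depth, stride and 4 KiB page size are assumed values, and the aux-page term is omitted for brevity:

#include <stdio.h>

#define PG_SIZE                 4096u	/* assumed PAGE_SIZE */
#define MAX_PBL_LVL_1_PGS_SHIFT 9	/* matches qplib_res.h */
#define MAX_PDL_LVL_SHIFT       9	/* matches qplib_res.h */

int main(void)
{
	unsigned int depth = 65536;	/* assumed queue depth */
	unsigned int stride = 128;	/* assumed element size in bytes */
	unsigned int npages, npbl, npde;

	/* PTE pages needed to back depth * stride bytes */
	npages = (depth * stride) / PG_SIZE;
	if ((depth * stride) % PG_SIZE)
		npages++;
	/* PBL pages: one 64-bit entry per PTE page, 512 entries per page */
	npbl = npages >> MAX_PBL_LVL_1_PGS_SHIFT;
	if (npages % (1u << MAX_PBL_LVL_1_PGS_SHIFT))
		npbl++;
	/* PDE entries: one per PBL page, again 512 per page */
	npde = npbl >> MAX_PDL_LVL_SHIFT;
	if (npbl % (1u << MAX_PDL_LVL_SHIFT))
		npde++;

	printf("npages=%u npbl=%u npde=%u\n", npages, npbl, npde);
	return 0;
}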
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h
index aaa76d7..58bad6f 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_res.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h
@@ -41,6 +41,29 @@
 
 extern const struct bnxt_qplib_gid bnxt_qplib_gid_zero;
 
+#define CHIP_NUM_57508		0x1750
+#define CHIP_NUM_57504		0x1751
+#define CHIP_NUM_57502		0x1752
+
+enum bnxt_qplib_wqe_mode {
+	BNXT_QPLIB_WQE_MODE_STATIC	= 0x00,
+	BNXT_QPLIB_WQE_MODE_VARIABLE	= 0x01,
+	BNXT_QPLIB_WQE_MODE_INVALID	= 0x02
+};
+
+struct bnxt_qplib_drv_modes {
+	u8	wqe_mode;
+	/* Other modes to follow here */
+};
+
+struct bnxt_qplib_chip_ctx {
+	u16	chip_num;
+	u8	chip_rev;
+	u8	chip_metal;
+	u16	hw_stats_size;
+	struct bnxt_qplib_drv_modes modes;
+};
+
 #define PTR_CNT_PER_PG		(PAGE_SIZE / sizeof(void *))
 #define PTR_MAX_IDX_PER_PG	(PTR_CNT_PER_PG - 1)
 #define PTR_PG(x)		(((x) & ~PTR_MAX_IDX_PER_PG) / PTR_CNT_PER_PG)
@@ -55,7 +78,8 @@
 enum bnxt_qplib_hwq_type {
 	HWQ_TYPE_CTX,
 	HWQ_TYPE_QUEUE,
-	HWQ_TYPE_L2_CMPL
+	HWQ_TYPE_L2_CMPL,
+	HWQ_TYPE_MR
 };
 
 #define MAX_PBL_LVL_0_PGS		1
@@ -63,6 +87,7 @@
 #define MAX_PBL_LVL_1_PGS_SHIFT		9
 #define MAX_PBL_LVL_1_PGS_FOR_LVL_2	256
 #define MAX_PBL_LVL_2_PGS		(256 * 512)
+#define MAX_PDL_LVL_SHIFT               9
 
 enum bnxt_qplib_pbl_lvl {
 	PBL_LVL_0,
@@ -78,6 +103,22 @@
 #define ROCE_PG_SIZE_8M		(8 * 1024 * 1024)
 #define ROCE_PG_SIZE_1G		(1024 * 1024 * 1024)
 
+enum bnxt_qplib_hwrm_pg_size {
+	BNXT_QPLIB_HWRM_PG_SIZE_4K	= 0,
+	BNXT_QPLIB_HWRM_PG_SIZE_8K	= 1,
+	BNXT_QPLIB_HWRM_PG_SIZE_64K	= 2,
+	BNXT_QPLIB_HWRM_PG_SIZE_2M	= 3,
+	BNXT_QPLIB_HWRM_PG_SIZE_8M	= 4,
+	BNXT_QPLIB_HWRM_PG_SIZE_1G	= 5,
+};
+
+struct bnxt_qplib_reg_desc {
+	u8		bar_id;
+	resource_size_t	bar_base;
+	void __iomem	*bar_reg;
+	size_t		len;
+};
+
 struct bnxt_qplib_pbl {
 	u32				pg_count;
 	u32				pg_size;
@@ -85,23 +126,54 @@
 	dma_addr_t			*pg_map_arr;
 };
 
+struct bnxt_qplib_sg_info {
+	struct ib_umem			*umem;
+	u32				npages;
+	u32				pgshft;
+	u32				pgsize;
+	bool				nopte;
+};
+
+struct bnxt_qplib_hwq_attr {
+	struct bnxt_qplib_res		*res;
+	struct bnxt_qplib_sg_info	*sginfo;
+	enum bnxt_qplib_hwq_type	type;
+	u32				depth;
+	u32				stride;
+	u32				aux_stride;
+	u32				aux_depth;
+};
+
 struct bnxt_qplib_hwq {
 	struct pci_dev			*pdev;
 	/* lock to protect qplib_hwq */
 	spinlock_t			lock;
-	struct bnxt_qplib_pbl		pbl[PBL_LVL_MAX];
+	struct bnxt_qplib_pbl		pbl[PBL_LVL_MAX + 1];
 	enum bnxt_qplib_pbl_lvl		level;		/* 0, 1, or 2 */
 	/* ptr for easy access to the PBL entries */
 	void				**pbl_ptr;
 	/* ptr for easy access to the dma_addr */
 	dma_addr_t			*pbl_dma_ptr;
 	u32				max_elements;
+	u32				depth;
 	u16				element_size;	/* Size of each entry */
+	u16				qe_ppg;	/* queue entry per page */
 
 	u32				prod;		/* raw */
 	u32				cons;		/* raw */
 	u8				cp_bit;
 	u8				is_user;
+	u64				*pad_pg;
+	u32				pad_stride;
+	u32				pad_pgofft;
+};
+
+struct bnxt_qplib_db_info {
+	void __iomem		*db;
+	void __iomem		*priv_db;
+	struct bnxt_qplib_hwq	*hwq;
+	u32			xid;
+	u32			max_slot;
 };
 
 /* Tables */
@@ -159,6 +231,15 @@
 #define BNXT_QPLIB_MAX_CQ_CTX_ENTRY_SIZE	64
 #define BNXT_QPLIB_MAX_MRW_CTX_ENTRY_SIZE	128
 
+#define MAX_TQM_ALLOC_REQ               48
+#define MAX_TQM_ALLOC_BLK_SIZE          8
+struct bnxt_qplib_tqm_ctx {
+	struct bnxt_qplib_hwq           pde;
+	u8                              pde_level; /* Original level */
+	struct bnxt_qplib_hwq           qtbl[MAX_TQM_ALLOC_REQ];
+	u8                              qcount[MAX_TQM_ALLOC_REQ];
+};
+
 struct bnxt_qplib_ctx {
 	u32				qpc_count;
 	struct bnxt_qplib_hwq		qpc_tbl;
@@ -169,27 +250,12 @@
 	u32				cq_count;
 	struct bnxt_qplib_hwq		cq_tbl;
 	struct bnxt_qplib_hwq		tim_tbl;
-#define MAX_TQM_ALLOC_REQ		48
-#define MAX_TQM_ALLOC_BLK_SIZE		8
-	u8				tqm_count[MAX_TQM_ALLOC_REQ];
-	struct bnxt_qplib_hwq		tqm_pde;
-	u32				tqm_pde_level;
-	struct bnxt_qplib_hwq		tqm_tbl[MAX_TQM_ALLOC_REQ];
+	struct bnxt_qplib_tqm_ctx	tqm_ctx;
 	struct bnxt_qplib_stats		stats;
 	struct bnxt_qplib_vf_res	vf_res;
 	u64				hwrm_intf_ver;
 };
 
-struct bnxt_qplib_chip_ctx {
-	u16	chip_num;
-	u8	chip_rev;
-	u8	chip_metal;
-};
-
-#define CHIP_NUM_57508		0x1750
-#define CHIP_NUM_57504		0x1751
-#define CHIP_NUM_57502		0x1752
-
 struct bnxt_qplib_res {
 	struct pci_dev			*pdev;
 	struct bnxt_qplib_chip_ctx	*cctx;
@@ -223,11 +289,57 @@
 	       RING_ALLOC_REQ_RING_TYPE_ROCE_CMPL;
 }
 
-struct bnxt_qplib_sg_info {
-	struct scatterlist		*sglist;
-	u32				nmap;
-	u32				npages;
-};
+static inline u8 bnxt_qplib_base_pg_size(struct bnxt_qplib_hwq *hwq)
+{
+	u8 pg_size = BNXT_QPLIB_HWRM_PG_SIZE_4K;
+	struct bnxt_qplib_pbl *pbl;
+
+	pbl = &hwq->pbl[PBL_LVL_0];
+	switch (pbl->pg_size) {
+	case ROCE_PG_SIZE_4K:
+		pg_size = BNXT_QPLIB_HWRM_PG_SIZE_4K;
+		break;
+	case ROCE_PG_SIZE_8K:
+		pg_size = BNXT_QPLIB_HWRM_PG_SIZE_8K;
+		break;
+	case ROCE_PG_SIZE_64K:
+		pg_size = BNXT_QPLIB_HWRM_PG_SIZE_64K;
+		break;
+	case ROCE_PG_SIZE_2M:
+		pg_size = BNXT_QPLIB_HWRM_PG_SIZE_2M;
+		break;
+	case ROCE_PG_SIZE_8M:
+		pg_size = BNXT_QPLIB_HWRM_PG_SIZE_8M;
+		break;
+	case ROCE_PG_SIZE_1G:
+		pg_size = BNXT_QPLIB_HWRM_PG_SIZE_1G;
+		break;
+	default:
+		break;
+	}
+
+	return pg_size;
+}
+
+static inline void *bnxt_qplib_get_qe(struct bnxt_qplib_hwq *hwq,
+				      u32 indx, u64 *pg)
+{
+	u32 pg_num, pg_idx;
+
+	pg_num = (indx / hwq->qe_ppg);
+	pg_idx = (indx % hwq->qe_ppg);
+	if (pg)
+		*pg = (u64)&hwq->pbl_ptr[pg_num];
+	return (void *)(hwq->pbl_ptr[pg_num] + hwq->element_size * pg_idx);
+}
+
+static inline void *bnxt_qplib_get_prod_qe(struct bnxt_qplib_hwq *hwq, u32 idx)
+{
+	idx += hwq->prod;
+	if (idx >= hwq->depth)
+		idx -= hwq->depth;
+	return bnxt_qplib_get_qe(hwq, idx, NULL);
+}
 
 #define to_bnxt_qplib(ptr, type, member)	\
 	container_of(ptr, type, member)
@@ -235,11 +347,10 @@
 struct bnxt_qplib_pd;
 struct bnxt_qplib_dev_attr;
 
-void bnxt_qplib_free_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq);
-int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
-			      struct bnxt_qplib_sg_info *sg_info, u32 *elements,
-			      u32 elements_per_page, u32 aux, u32 pg_size,
-			      enum bnxt_qplib_hwq_type hwq_type);
+void bnxt_qplib_free_hwq(struct bnxt_qplib_res *res,
+			 struct bnxt_qplib_hwq *hwq);
+int bnxt_qplib_alloc_init_hwq(struct bnxt_qplib_hwq *hwq,
+			      struct bnxt_qplib_hwq_attr *hwq_attr);
 void bnxt_qplib_get_guid(u8 *dev_addr, u8 *guid);
 int bnxt_qplib_alloc_pd(struct bnxt_qplib_pd_tbl *pd_tbl,
 			struct bnxt_qplib_pd *pd);
@@ -258,9 +369,90 @@
 int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev,
 			 struct net_device *netdev,
 			 struct bnxt_qplib_dev_attr *dev_attr);
-void bnxt_qplib_free_ctx(struct pci_dev *pdev,
+void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res,
 			 struct bnxt_qplib_ctx *ctx);
-int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
+int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res,
 			 struct bnxt_qplib_ctx *ctx,
 			 bool virt_fn, bool is_p5);
+
+static inline void bnxt_qplib_hwq_incr_prod(struct bnxt_qplib_hwq *hwq, u32 cnt)
+{
+	hwq->prod = (hwq->prod + cnt) % hwq->depth;
+}
+
+static inline void bnxt_qplib_hwq_incr_cons(struct bnxt_qplib_hwq *hwq,
+					    u32 cnt)
+{
+	hwq->cons = (hwq->cons + cnt) % hwq->depth;
+}
+
+static inline void bnxt_qplib_ring_db32(struct bnxt_qplib_db_info *info,
+					bool arm)
+{
+	u32 key;
+
+	key = info->hwq->cons & (info->hwq->max_elements - 1);
+	key |= (CMPL_DOORBELL_IDX_VALID |
+		(CMPL_DOORBELL_KEY_CMPL & CMPL_DOORBELL_KEY_MASK));
+	if (!arm)
+		key |= CMPL_DOORBELL_MASK;
+	writel(key, info->db);
+}
+
+static inline void bnxt_qplib_ring_db(struct bnxt_qplib_db_info *info,
+				      u32 type)
+{
+	u64 key = 0;
+
+	key = (info->xid & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | type;
+	key <<= 32;
+	key |= (info->hwq->cons & (info->hwq->max_elements - 1)) &
+		DBC_DBC_INDEX_MASK;
+	writeq(key, info->db);
+}
+
+static inline void bnxt_qplib_ring_prod_db(struct bnxt_qplib_db_info *info,
+					   u32 type)
+{
+	u64 key = 0;
+
+	key = (info->xid & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | type;
+	key <<= 32;
+	key |= ((info->hwq->prod / info->max_slot)) & DBC_DBC_INDEX_MASK;
+	writeq(key, info->db);
+}
+
+static inline void bnxt_qplib_armen_db(struct bnxt_qplib_db_info *info,
+				       u32 type)
+{
+	u64 key = 0;
+
+	key = (info->xid & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | type;
+	key <<= 32;
+	writeq(key, info->priv_db);
+}
+
+static inline void bnxt_qplib_srq_arm_db(struct bnxt_qplib_db_info *info,
+					 u32 th)
+{
+	u64 key = 0;
+
+	key = (info->xid & DBC_DBC_XID_MASK) | DBC_DBC_PATH_ROCE | th;
+	key <<= 32;
+	key |=  th & DBC_DBC_INDEX_MASK;
+	writeq(key, info->priv_db);
+}
+
+static inline void bnxt_qplib_ring_nq_db(struct bnxt_qplib_db_info *info,
+					 struct bnxt_qplib_chip_ctx *cctx,
+					 bool arm)
+{
+	u32 type;
+
+	type = arm ? DBC_DBC_TYPE_NQ_ARM : DBC_DBC_TYPE_NQ;
+	if (bnxt_qplib_is_chip_gen_p5(cctx))
+		bnxt_qplib_ring_db(info, type);
+	else
+		bnxt_qplib_ring_db32(info, arm);
+}
 #endif /* __BNXT_QPLIB_RES_H__ */
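The bnxt_qplib_get_qe() helper added above splits an element index into a PBL page number and an in-page slot using qe_ppg (queue entries per page). A minimal sketch with assumed sizes:

#include <stdio.h>

int main(void)
{
	unsigned int pg_size = 4096;		/* assumed PAGE_SIZE */
	unsigned int stride = 64;		/* assumed element size */
	unsigned int qe_ppg = pg_size / stride;	/* queue entries per page */
	unsigned int indx = 100;		/* arbitrary element index */

	/* Same split as bnxt_qplib_get_qe(): page number, then in-page slot. */
	printf("index %u -> page %u, slot %u (byte offset %u)\n",
	       indx, indx / qe_ppg, indx % qe_ppg, (indx % qe_ppg) * stride);
	return 0;
}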
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
index 079aaaa..64d44f5 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
@@ -132,9 +132,6 @@
 	attr->max_raw_ethy_qp = le32_to_cpu(sb->max_raw_eth_qp);
 	attr->max_ah = le32_to_cpu(sb->max_ah);
 
-	attr->max_fmr = le32_to_cpu(sb->max_fmr);
-	attr->max_map_per_fmr = sb->max_map_per_fmr;
-
 	attr->max_srq = le16_to_cpu(sb->max_srq);
 	attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1;
 	attr->max_srq_sges = sb->max_srq_sge;
@@ -585,7 +582,7 @@
 
 	/* Free the qplib's MRW memory */
 	if (mrw->hwq.max_elements)
-		bnxt_qplib_free_hwq(res->pdev, &mrw->hwq);
+		bnxt_qplib_free_hwq(res, &mrw->hwq);
 
 	return 0;
 }
@@ -646,7 +643,7 @@
 	if (mrw->hwq.max_elements) {
 		mrw->va = 0;
 		mrw->total_size = 0;
-		bnxt_qplib_free_hwq(res->pdev, &mrw->hwq);
+		bnxt_qplib_free_hwq(res, &mrw->hwq);
 	}
 
 	return 0;
@@ -656,10 +653,12 @@
 		      u64 *pbl_tbl, int num_pbls, bool block, u32 buf_pg_size)
 {
 	struct bnxt_qplib_rcfw *rcfw = res->rcfw;
-	struct cmdq_register_mr req;
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
+	struct bnxt_qplib_sg_info sginfo = {};
 	struct creq_register_mr_resp resp;
-	u16 cmd_flags = 0, level;
+	struct cmdq_register_mr req;
 	int pg_ptrs, pages, i, rc;
+	u16 cmd_flags = 0, level;
 	dma_addr_t **pbl_ptr;
 	u32 pg_size;
 
@@ -674,20 +673,23 @@
 
 		if (pages > MAX_PBL_LVL_1_PGS) {
 			dev_err(&res->pdev->dev,
-				"SP: Reg MR pages requested (0x%x) exceeded max (0x%x)\n",
+				"SP: Reg MR: pages requested (0x%x) exceeded max (0x%x)\n",
 				pages, MAX_PBL_LVL_1_PGS);
 			return -ENOMEM;
 		}
 		/* Free the hwq if it already exist, must be a rereg */
 		if (mr->hwq.max_elements)
-			bnxt_qplib_free_hwq(res->pdev, &mr->hwq);
-
-		mr->hwq.max_elements = pages;
+			bnxt_qplib_free_hwq(res, &mr->hwq);
 		/* Use system PAGE_SIZE */
-		rc = bnxt_qplib_alloc_init_hwq(res->pdev, &mr->hwq, NULL,
-					       &mr->hwq.max_elements,
-					       PAGE_SIZE, 0, PAGE_SIZE,
-					       HWQ_TYPE_CTX);
+		hwq_attr.res = res;
+		hwq_attr.depth = pages;
+		hwq_attr.stride = PAGE_SIZE;
+		hwq_attr.type = HWQ_TYPE_MR;
+		hwq_attr.sginfo = &sginfo;
+		hwq_attr.sginfo->npages = pages;
+		hwq_attr.sginfo->pgsize = PAGE_SIZE;
+		hwq_attr.sginfo->pgshft = PAGE_SHIFT;
+		rc = bnxt_qplib_alloc_init_hwq(&mr->hwq, &hwq_attr);
 		if (rc) {
 			dev_err(&res->pdev->dev,
 				"SP: Reg MR memory allocation failed\n");
@@ -734,7 +736,7 @@
 
 fail:
 	if (mr->hwq.max_elements)
-		bnxt_qplib_free_hwq(res->pdev, &mr->hwq);
+		bnxt_qplib_free_hwq(res, &mr->hwq);
 	return rc;
 }
 
@@ -742,6 +744,8 @@
 					struct bnxt_qplib_frpl *frpl,
 					int max_pg_ptrs)
 {
+	struct bnxt_qplib_hwq_attr hwq_attr = {};
+	struct bnxt_qplib_sg_info sginfo = {};
 	int pg_ptrs, pages, rc;
 
 	/* Re-calculate the max to fit the HWQ allocation model */
@@ -753,10 +757,15 @@
 	if (pages > MAX_PBL_LVL_1_PGS)
 		return -ENOMEM;
 
-	frpl->hwq.max_elements = pages;
-	rc = bnxt_qplib_alloc_init_hwq(res->pdev, &frpl->hwq, NULL,
-				       &frpl->hwq.max_elements, PAGE_SIZE, 0,
-				       PAGE_SIZE, HWQ_TYPE_CTX);
+	sginfo.pgsize = PAGE_SIZE;
+	sginfo.nopte = true;
+
+	hwq_attr.res = res;
+	hwq_attr.depth = pg_ptrs;
+	hwq_attr.stride = PAGE_SIZE;
+	hwq_attr.sginfo = &sginfo;
+	hwq_attr.type = HWQ_TYPE_CTX;
+	rc = bnxt_qplib_alloc_init_hwq(&frpl->hwq, &hwq_attr);
 	if (!rc)
 		frpl->max_pg_ptrs = pg_ptrs;
 
@@ -766,7 +775,7 @@
 int bnxt_qplib_free_fast_reg_page_list(struct bnxt_qplib_res *res,
 				       struct bnxt_qplib_frpl *frpl)
 {
-	bnxt_qplib_free_hwq(res->pdev, &frpl->hwq);
+	bnxt_qplib_free_hwq(res, &frpl->hwq);
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
index 194f5ef..967890c 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
@@ -65,8 +65,6 @@
 	u32				max_mw;
 	u32				max_raw_ethy_qp;
 	u32				max_ah;
-	u32				max_fmr;
-	u32				max_map_per_fmr;
 	u32				max_srq;
 	u32				max_srq_wqes;
 	u32				max_srq_sges;
diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
index e4b09e7..3e40e0d 100644
--- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h
+++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
@@ -210,6 +210,20 @@
 	__le32 data[24];
 };
 
+/* sq_send_hdr (size:256b/32B) */
+struct sq_send_hdr {
+	u8	wqe_type;
+	u8	flags;
+	u8	wqe_size;
+	u8	reserved8_1;
+	__le32	inv_key_or_imm_data;
+	__le32	length;
+	__le32	q_key;
+	__le32	dst_qp;
+	__le32	avid;
+	__le64	reserved64;
+};
+
 /* Send Raw Ethernet and QP1 SQ WQE (40 bytes) */
 struct sq_send_raweth_qp1 {
 	u8 wqe_type;
@@ -265,6 +279,21 @@
 	__le32 data[24];
 };
 
+/* sq_send_raweth_qp1_hdr (size:256b/32B) */
+struct sq_send_raweth_qp1_hdr {
+	u8	wqe_type;
+	u8	flags;
+	u8	wqe_size;
+	u8	reserved8;
+	__le16	lflags;
+	__le16	cfa_action;
+	__le32	length;
+	__le32	reserved32_1;
+	__le32	cfa_meta;
+	__le32	reserved32_2;
+	__le64	reserved64;
+};
+
 /* RDMA SQ WQE (40 bytes) */
 struct sq_rdma {
 	u8 wqe_type;
@@ -288,6 +317,20 @@
 	__le32 data[24];
 };
 
+/* sq_rdma_hdr (size:256b/32B) */
+struct sq_rdma_hdr {
+	u8	wqe_type;
+	u8	flags;
+	u8	wqe_size;
+	u8	reserved8;
+	__le32	imm_data;
+	__le32	length;
+	__le32	reserved32_1;
+	__le64	remote_va;
+	__le32	remote_key;
+	__le32	reserved32_2;
+};
+
 /* Atomic SQ WQE (40 bytes) */
 struct sq_atomic {
 	u8 wqe_type;
@@ -307,6 +350,17 @@
 	__le32 data[24];
 };
 
+/* sq_atomic_hdr (size:256b/32B) */
+struct sq_atomic_hdr {
+	u8	wqe_type;
+	u8	flags;
+	__le16	reserved16;
+	__le32	remote_key;
+	__le64	remote_va;
+	__le64	swap_data;
+	__le64	cmp_data;
+};
+
 /* Local Invalidate SQ WQE (40 bytes) */
 struct sq_localinvalidate {
 	u8 wqe_type;
@@ -324,6 +378,16 @@
 	__le32 data[24];
 };
 
+/* sq_localinvalidate_hdr (size:256b/32B) */
+struct sq_localinvalidate_hdr {
+	u8	wqe_type;
+	u8	flags;
+	__le16	reserved16;
+	__le32	inv_l_key;
+	__le64	reserved64;
+	u8	reserved128[16];
+};
+
 /* FR-PMR SQ WQE (40 bytes) */
 struct sq_fr_pmr {
 	u8 wqe_type;
@@ -380,6 +444,21 @@
 	__le32 data[24];
 };
 
+/* sq_fr_pmr_hdr (size:256b/32B) */
+struct sq_fr_pmr_hdr {
+	u8	wqe_type;
+	u8	flags;
+	u8	access_cntl;
+	u8	zero_based_page_size_log;
+	__le32	l_key;
+	u8	length[5];
+	u8	reserved8_1;
+	u8	reserved8_2;
+	u8	numlevels_pbl_page_size_log;
+	__le64	pblptr;
+	__le64	va;
+};
+
 /* Bind SQ WQE (40 bytes) */
 struct sq_bind {
 	u8 wqe_type;
@@ -417,6 +496,22 @@
 	#define SQ_BIND_DATA_SFT				    0
 };
 
+/* sq_bind_hdr (size:256b/32B) */
+struct sq_bind_hdr {
+	u8	wqe_type;
+	u8	flags;
+	u8	access_cntl;
+	u8	reserved8_1;
+	u8	mw_type_zero_based;
+	u8	reserved8_2;
+	__le16	reserved16;
+	__le32	parent_l_key;
+	__le32	l_key;
+	__le64	va;
+	u8	length[5];
+	u8	reserved24[3];
+};
+
 /* RQ/SRQ WQE Structures */
 /* RQ/SRQ WQE (40 bytes) */
 struct rq_wqe {
@@ -435,6 +530,17 @@
 	__le32 data[24];
 };
 
+/* rq_wqe_hdr (size:256b/32B) */
+struct rq_wqe_hdr {
+	u8	wqe_type;
+	u8	flags;
+	u8	wqe_size;
+	u8	reserved8;
+	__le32	reserved32;
+	__le32	wr_id[2];
+	u8	reserved128[16];
+};
+
 /* CQ CQE Structures */
 /* Base CQE (32 bytes) */
 struct cq_base {
@@ -1020,6 +1126,7 @@
 	#define CMDQ_CREATE_QP_QP_FLAGS_FORCE_COMPLETION	   0x2UL
 	#define CMDQ_CREATE_QP_QP_FLAGS_RESERVED_LKEY_ENABLE      0x4UL
 	#define CMDQ_CREATE_QP_QP_FLAGS_FR_PMR_ENABLED		   0x8UL
+	#define CMDQ_CREATE_QP_QP_FLAGS_VARIABLE_SIZED_WQE_ENABLED 0x10UL
 	u8 type;
 	#define CMDQ_CREATE_QP_TYPE_RC				   0x2UL
 	#define CMDQ_CREATE_QP_TYPE_UD				   0x4UL
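The new *_hdr mirror structures added to roce_hsi.h above are each annotated as 256 bits / 32 bytes. A minimal stand-alone size check for sq_send_hdr, with the kernel endian types replaced by fixed-width stand-ins:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the kernel struct; __le32/__le64 become plain fixed-width
 * integers, which does not change the size or offsets on common ABIs. */
struct sq_send_hdr {
	uint8_t  wqe_type;
	uint8_t  flags;
	uint8_t  wqe_size;
	uint8_t  reserved8_1;
	uint32_t inv_key_or_imm_data;
	uint32_t length;
	uint32_t q_key;
	uint32_t dst_qp;
	uint32_t avid;
	uint64_t reserved64;
};

_Static_assert(sizeof(struct sq_send_hdr) == 32, "sq_send_hdr must be 32 bytes");

int main(void)
{
	printf("sizeof(struct sq_send_hdr) = %zu bytes\n",
	       sizeof(struct sq_send_hdr));
	return 0;
}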
diff --git a/drivers/infiniband/hw/cxgb3/Kconfig b/drivers/infiniband/hw/cxgb3/Kconfig
deleted file mode 100644
index 8c1a72b..0000000
--- a/drivers/infiniband/hw/cxgb3/Kconfig
+++ /dev/null
@@ -1,19 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config INFINIBAND_CXGB3
-	tristate "Chelsio RDMA Driver"
-	depends on CHELSIO_T3
-	select GENERIC_ALLOCATOR
-	---help---
-	  This is an iWARP/RDMA driver for the Chelsio T3 1GbE and
-	  10GbE adapters.
-
-	  For general information about Chelsio and our products, visit
-	  our website at <http://www.chelsio.com>.
-
-	  For customer support, please visit our customer support page at
-	  <http://www.chelsio.com/support.html>.
-
-	  Please send feedback to <linux-bugs@chelsio.com>.
-
-	  To compile this driver as a module, choose M here: the module
-	  will be called iw_cxgb3.
diff --git a/drivers/infiniband/hw/cxgb3/Makefile b/drivers/infiniband/hw/cxgb3/Makefile
deleted file mode 100644
index 34bb86a..0000000
--- a/drivers/infiniband/hw/cxgb3/Makefile
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-ccflags-y := -I $(srctree)/drivers/net/ethernet/chelsio/cxgb3
-
-obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o
-
-iw_cxgb3-y :=  iwch_cm.o iwch_ev.o iwch_cq.o iwch_qp.o iwch_mem.o \
-	       iwch_provider.o iwch.o cxio_hal.o cxio_resource.o
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
deleted file mode 100644
index 95b22a6..0000000
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.c
+++ /dev/null
@@ -1,1312 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <asm/delay.h>
-
-#include <linux/mutex.h>
-#include <linux/netdevice.h>
-#include <linux/sched.h>
-#include <linux/spinlock.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <linux/slab.h>
-#include <net/net_namespace.h>
-
-#include "cxio_resource.h"
-#include "cxio_hal.h"
-#include "cxgb3_offload.h"
-#include "sge_defs.h"
-
-static LIST_HEAD(rdev_list);
-static cxio_hal_ev_callback_func_t cxio_ev_cb = NULL;
-
-static struct cxio_rdev *cxio_hal_find_rdev_by_name(char *dev_name)
-{
-	struct cxio_rdev *rdev;
-
-	list_for_each_entry(rdev, &rdev_list, entry)
-		if (!strcmp(rdev->dev_name, dev_name))
-			return rdev;
-	return NULL;
-}
-
-static struct cxio_rdev *cxio_hal_find_rdev_by_t3cdev(struct t3cdev *tdev)
-{
-	struct cxio_rdev *rdev;
-
-	list_for_each_entry(rdev, &rdev_list, entry)
-		if (rdev->t3cdev_p == tdev)
-			return rdev;
-	return NULL;
-}
-
-int cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq *cq,
-		   enum t3_cq_opcode op, u32 credit)
-{
-	int ret;
-	struct t3_cqe *cqe;
-	u32 rptr;
-
-	struct rdma_cq_op setup;
-	setup.id = cq->cqid;
-	setup.credits = (op == CQ_CREDIT_UPDATE) ? credit : 0;
-	setup.op = op;
-	ret = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_OP, &setup);
-
-	if ((ret < 0) || (op == CQ_CREDIT_UPDATE))
-		return ret;
-
-	/*
-	 * If the rearm returned an index other than our current index,
-	 * then there might be CQE's in flight (being DMA'd).  We must wait
-	 * here for them to complete or the consumer can miss a notification.
-	 */
-	if (Q_PTR2IDX((cq->rptr), cq->size_log2) != ret) {
-		int i=0;
-
-		rptr = cq->rptr;
-
-		/*
-		 * Keep the generation correct by bumping rptr until it
-		 * matches the index returned by the rearm - 1.
-		 */
-		while (Q_PTR2IDX((rptr+1), cq->size_log2) != ret)
-			rptr++;
-
-		/*
-		 * Now rptr is the index for the (last) cqe that was
-		 * in-flight at the time the HW rearmed the CQ.  We
-		 * spin until that CQE is valid.
-		 */
-		cqe = cq->queue + Q_PTR2IDX(rptr, cq->size_log2);
-		while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) {
-			udelay(1);
-			if (i++ > 1000000) {
-				pr_err("%s: stalled rnic\n", rdev_p->dev_name);
-				return -EIO;
-			}
-		}
-
-		return 1;
-	}
-
-	return 0;
-}
-
-static int cxio_hal_clear_cq_ctx(struct cxio_rdev *rdev_p, u32 cqid)
-{
-	struct rdma_cq_setup setup;
-	setup.id = cqid;
-	setup.base_addr = 0;	/* NULL address */
-	setup.size = 0;		/* disaable the CQ */
-	setup.credits = 0;
-	setup.credit_thres = 0;
-	setup.ovfl_mode = 0;
-	return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
-}
-
-static int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid)
-{
-	u64 sge_cmd;
-	struct t3_modify_qp_wr *wqe;
-	struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_KERNEL);
-	if (!skb) {
-		pr_debug("%s alloc_skb failed\n", __func__);
-		return -ENOMEM;
-	}
-	wqe = skb_put_zero(skb, sizeof(*wqe));
-	build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD,
-		       T3_COMPLETION_FLAG | T3_NOTIFY_FLAG, 0, qpid, 7,
-		       T3_SOPEOP);
-	wqe->flags = cpu_to_be32(MODQP_WRITE_EC);
-	sge_cmd = qpid << 8 | 3;
-	wqe->sge_cmd = cpu_to_be64(sge_cmd);
-	skb->priority = CPL_PRIORITY_CONTROL;
-	return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb);
-}
-
-int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel)
-{
-	struct rdma_cq_setup setup;
-	int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe);
-
-	size += 1; /* one extra page for storing cq-in-err state */
-	cq->cqid = cxio_hal_get_cqid(rdev_p->rscp);
-	if (!cq->cqid)
-		return -ENOMEM;
-	if (kernel) {
-		cq->sw_queue = kzalloc(size, GFP_KERNEL);
-		if (!cq->sw_queue)
-			return -ENOMEM;
-	}
-	cq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), size,
-					     &(cq->dma_addr), GFP_KERNEL);
-	if (!cq->queue) {
-		kfree(cq->sw_queue);
-		return -ENOMEM;
-	}
-	dma_unmap_addr_set(cq, mapping, cq->dma_addr);
-	setup.id = cq->cqid;
-	setup.base_addr = (u64) (cq->dma_addr);
-	setup.size = 1UL << cq->size_log2;
-	setup.credits = 65535;
-	setup.credit_thres = 1;
-	if (rdev_p->t3cdev_p->type != T3A)
-		setup.ovfl_mode = 0;
-	else
-		setup.ovfl_mode = 1;
-	return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
-}
-
-static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
-{
-	struct cxio_qpid_list *entry;
-	u32 qpid;
-	int i;
-
-	mutex_lock(&uctx->lock);
-	if (!list_empty(&uctx->qpids)) {
-		entry = list_entry(uctx->qpids.next, struct cxio_qpid_list,
-				   entry);
-		list_del(&entry->entry);
-		qpid = entry->qpid;
-		kfree(entry);
-	} else {
-		qpid = cxio_hal_get_qpid(rdev_p->rscp);
-		if (!qpid)
-			goto out;
-		for (i = qpid+1; i & rdev_p->qpmask; i++) {
-			entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-			if (!entry)
-				break;
-			entry->qpid = i;
-			list_add_tail(&entry->entry, &uctx->qpids);
-		}
-	}
-out:
-	mutex_unlock(&uctx->lock);
-	pr_debug("%s qpid 0x%x\n", __func__, qpid);
-	return qpid;
-}
-
-static void put_qpid(struct cxio_rdev *rdev_p, u32 qpid,
-		     struct cxio_ucontext *uctx)
-{
-	struct cxio_qpid_list *entry;
-
-	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-	if (!entry)
-		return;
-	pr_debug("%s qpid 0x%x\n", __func__, qpid);
-	entry->qpid = qpid;
-	mutex_lock(&uctx->lock);
-	list_add_tail(&entry->entry, &uctx->qpids);
-	mutex_unlock(&uctx->lock);
-}
-
-void cxio_release_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
-{
-	struct list_head *pos, *nxt;
-	struct cxio_qpid_list *entry;
-
-	mutex_lock(&uctx->lock);
-	list_for_each_safe(pos, nxt, &uctx->qpids) {
-		entry = list_entry(pos, struct cxio_qpid_list, entry);
-		list_del_init(&entry->entry);
-		if (!(entry->qpid & rdev_p->qpmask))
-			cxio_hal_put_qpid(rdev_p->rscp, entry->qpid);
-		kfree(entry);
-	}
-	mutex_unlock(&uctx->lock);
-}
-
-void cxio_init_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
-{
-	INIT_LIST_HEAD(&uctx->qpids);
-	mutex_init(&uctx->lock);
-}
-
-int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain,
-		   struct t3_wq *wq, struct cxio_ucontext *uctx)
-{
-	int depth = 1UL << wq->size_log2;
-	int rqsize = 1UL << wq->rq_size_log2;
-
-	wq->qpid = get_qpid(rdev_p, uctx);
-	if (!wq->qpid)
-		return -ENOMEM;
-
-	wq->rq = kcalloc(depth, sizeof(struct t3_swrq), GFP_KERNEL);
-	if (!wq->rq)
-		goto err1;
-
-	wq->rq_addr = cxio_hal_rqtpool_alloc(rdev_p, rqsize);
-	if (!wq->rq_addr)
-		goto err2;
-
-	wq->sq = kcalloc(depth, sizeof(struct t3_swsq), GFP_KERNEL);
-	if (!wq->sq)
-		goto err3;
-
-	wq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev),
-				       depth * sizeof(union t3_wr),
-				       &(wq->dma_addr), GFP_KERNEL);
-	if (!wq->queue)
-		goto err4;
-
-	dma_unmap_addr_set(wq, mapping, wq->dma_addr);
-	wq->doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr;
-	if (!kernel_domain)
-		wq->udb = (u64)rdev_p->rnic_info.udbell_physbase +
-					(wq->qpid << rdev_p->qpshift);
-	wq->rdev = rdev_p;
-	pr_debug("%s qpid 0x%x doorbell 0x%p udb 0x%llx\n",
-		 __func__, wq->qpid, wq->doorbell, (unsigned long long)wq->udb);
-	return 0;
-err4:
-	kfree(wq->sq);
-err3:
-	cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, rqsize);
-err2:
-	kfree(wq->rq);
-err1:
-	put_qpid(rdev_p, wq->qpid, uctx);
-	return -ENOMEM;
-}
-
-void cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
-{
-	cxio_hal_clear_cq_ctx(rdev_p, cq->cqid);
-	kfree(cq->sw_queue);
-	dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
-			  (1UL << (cq->size_log2))
-			  * sizeof(struct t3_cqe) + 1, cq->queue,
-			  dma_unmap_addr(cq, mapping));
-	cxio_hal_put_cqid(rdev_p->rscp, cq->cqid);
-}
-
-int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq,
-		    struct cxio_ucontext *uctx)
-{
-	dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
-			  (1UL << (wq->size_log2))
-			  * sizeof(union t3_wr), wq->queue,
-			  dma_unmap_addr(wq, mapping));
-	kfree(wq->sq);
-	cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, (1UL << wq->rq_size_log2));
-	kfree(wq->rq);
-	put_qpid(rdev_p, wq->qpid, uctx);
-	return 0;
-}
-
-static void insert_recv_cqe(struct t3_wq *wq, struct t3_cq *cq)
-{
-	struct t3_cqe cqe;
-
-	pr_debug("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __func__,
-		 wq, cq, cq->sw_rptr, cq->sw_wptr);
-	memset(&cqe, 0, sizeof(cqe));
-	cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) |
-			         V_CQE_OPCODE(T3_SEND) |
-				 V_CQE_TYPE(0) |
-				 V_CQE_SWCQE(1) |
-				 V_CQE_QPID(wq->qpid) |
-				 V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr,
-						       cq->size_log2)));
-	*(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe;
-	cq->sw_wptr++;
-}
-
-int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count)
-{
-	u32 ptr;
-	int flushed = 0;
-
-	pr_debug("%s wq %p cq %p\n", __func__, wq, cq);
-
-	/* flush RQ */
-	pr_debug("%s rq_rptr %u rq_wptr %u skip count %u\n", __func__,
-		 wq->rq_rptr, wq->rq_wptr, count);
-	ptr = wq->rq_rptr + count;
-	while (ptr++ != wq->rq_wptr) {
-		insert_recv_cqe(wq, cq);
-		flushed++;
-	}
-	return flushed;
-}
-
-static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq,
-		          struct t3_swsq *sqp)
-{
-	struct t3_cqe cqe;
-
-	pr_debug("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __func__,
-		 wq, cq, cq->sw_rptr, cq->sw_wptr);
-	memset(&cqe, 0, sizeof(cqe));
-	cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) |
-			         V_CQE_OPCODE(sqp->opcode) |
-			         V_CQE_TYPE(1) |
-			         V_CQE_SWCQE(1) |
-			         V_CQE_QPID(wq->qpid) |
-			         V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr,
-						       cq->size_log2)));
-	cqe.u.scqe.wrid_hi = sqp->sq_wptr;
-
-	*(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe;
-	cq->sw_wptr++;
-}
-
-int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count)
-{
-	__u32 ptr = wq->sq_rptr + count;
-	int flushed = 0;
-	struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2);
-
-	while (ptr != wq->sq_wptr) {
-		sqp->signaled = 0;
-		insert_sq_cqe(wq, cq, sqp);
-		ptr++;
-		sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2);
-		flushed++;
-	}
-	return flushed;
-}
-
-/*
- * Move all CQEs from the HWCQ into the SWCQ.
- */
-void cxio_flush_hw_cq(struct t3_cq *cq)
-{
-	struct t3_cqe *cqe, *swcqe;
-
-	pr_debug("%s cq %p cqid 0x%x\n", __func__, cq, cq->cqid);
-	cqe = cxio_next_hw_cqe(cq);
-	while (cqe) {
-		pr_debug("%s flushing hwcq rptr 0x%x to swcq wptr 0x%x\n",
-			 __func__, cq->rptr, cq->sw_wptr);
-		swcqe = cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2);
-		*swcqe = *cqe;
-		swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1));
-		cq->sw_wptr++;
-		cq->rptr++;
-		cqe = cxio_next_hw_cqe(cq);
-	}
-}
-
-static int cqe_completes_wr(struct t3_cqe *cqe, struct t3_wq *wq)
-{
-	if (CQE_OPCODE(*cqe) == T3_TERMINATE)
-		return 0;
-
-	if ((CQE_OPCODE(*cqe) == T3_RDMA_WRITE) && RQ_TYPE(*cqe))
-		return 0;
-
-	if ((CQE_OPCODE(*cqe) == T3_READ_RESP) && SQ_TYPE(*cqe))
-		return 0;
-
-	if (CQE_SEND_OPCODE(*cqe) && RQ_TYPE(*cqe) &&
-	    Q_EMPTY(wq->rq_rptr, wq->rq_wptr))
-		return 0;
-
-	return 1;
-}
-
-void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count)
-{
-	struct t3_cqe *cqe;
-	u32 ptr;
-
-	*count = 0;
-	ptr = cq->sw_rptr;
-	while (!Q_EMPTY(ptr, cq->sw_wptr)) {
-		cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2));
-		if ((SQ_TYPE(*cqe) ||
-		     ((CQE_OPCODE(*cqe) == T3_READ_RESP) && wq->oldest_read)) &&
-		    (CQE_QPID(*cqe) == wq->qpid))
-			(*count)++;
-		ptr++;
-	}
-	pr_debug("%s cq %p count %d\n", __func__, cq, *count);
-}
-
-void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count)
-{
-	struct t3_cqe *cqe;
-	u32 ptr;
-
-	*count = 0;
-	pr_debug("%s count zero %d\n", __func__, *count);
-	ptr = cq->sw_rptr;
-	while (!Q_EMPTY(ptr, cq->sw_wptr)) {
-		cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2));
-		if (RQ_TYPE(*cqe) && (CQE_OPCODE(*cqe) != T3_READ_RESP) &&
-		    (CQE_QPID(*cqe) == wq->qpid) && cqe_completes_wr(cqe, wq))
-			(*count)++;
-		ptr++;
-	}
-	pr_debug("%s cq %p count %d\n", __func__, cq, *count);
-}
-
-static int cxio_hal_init_ctrl_cq(struct cxio_rdev *rdev_p)
-{
-	struct rdma_cq_setup setup;
-	setup.id = 0;
-	setup.base_addr = 0;	/* NULL address */
-	setup.size = 1;		/* enable the CQ */
-	setup.credits = 0;
-
-	/* force SGE to redirect to RspQ and interrupt */
-	setup.credit_thres = 0;
-	setup.ovfl_mode = 1;
-	return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
-}
-
-static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p)
-{
-	int err;
-	u64 sge_cmd, ctx0, ctx1;
-	u64 base_addr;
-	struct t3_modify_qp_wr *wqe;
-	struct sk_buff *skb;
-
-	skb = alloc_skb(sizeof(*wqe), GFP_KERNEL);
-	if (!skb) {
-		pr_debug("%s alloc_skb failed\n", __func__);
-		return -ENOMEM;
-	}
-	err = cxio_hal_init_ctrl_cq(rdev_p);
-	if (err) {
-		pr_debug("%s err %d initializing ctrl_cq\n", __func__, err);
-		goto err;
-	}
-	rdev_p->ctrl_qp.workq = dma_alloc_coherent(
-					&(rdev_p->rnic_info.pdev->dev),
-					(1 << T3_CTRL_QP_SIZE_LOG2) *
-					sizeof(union t3_wr),
-					&(rdev_p->ctrl_qp.dma_addr),
-					GFP_KERNEL);
-	if (!rdev_p->ctrl_qp.workq) {
-		pr_debug("%s dma_alloc_coherent failed\n", __func__);
-		err = -ENOMEM;
-		goto err;
-	}
-	dma_unmap_addr_set(&rdev_p->ctrl_qp, mapping,
-			   rdev_p->ctrl_qp.dma_addr);
-	rdev_p->ctrl_qp.doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr;
-
-	mutex_init(&rdev_p->ctrl_qp.lock);
-	init_waitqueue_head(&rdev_p->ctrl_qp.waitq);
-
-	/* update HW Ctrl QP context */
-	base_addr = rdev_p->ctrl_qp.dma_addr;
-	base_addr >>= 12;
-	ctx0 = (V_EC_SIZE((1 << T3_CTRL_QP_SIZE_LOG2)) |
-		V_EC_BASE_LO((u32) base_addr & 0xffff));
-	ctx0 <<= 32;
-	ctx0 |= V_EC_CREDITS(FW_WR_NUM);
-	base_addr >>= 16;
-	ctx1 = (u32) base_addr;
-	base_addr >>= 32;
-	ctx1 |= ((u64) (V_EC_BASE_HI((u32) base_addr & 0xf) | V_EC_RESPQ(0) |
-			V_EC_TYPE(0) | V_EC_GEN(1) |
-			V_EC_UP_TOKEN(T3_CTL_QP_TID) | F_EC_VALID)) << 32;
-	wqe = skb_put_zero(skb, sizeof(*wqe));
-	build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 0, 0,
-		       T3_CTL_QP_TID, 7, T3_SOPEOP);
-	wqe->flags = cpu_to_be32(MODQP_WRITE_EC);
-	sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3;
-	wqe->sge_cmd = cpu_to_be64(sge_cmd);
-	wqe->ctx1 = cpu_to_be64(ctx1);
-	wqe->ctx0 = cpu_to_be64(ctx0);
-	pr_debug("CtrlQP dma_addr %pad workq %p size %d\n",
-		 &rdev_p->ctrl_qp.dma_addr, rdev_p->ctrl_qp.workq,
-		 1 << T3_CTRL_QP_SIZE_LOG2);
-	skb->priority = CPL_PRIORITY_CONTROL;
-	return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb);
-err:
-	kfree_skb(skb);
-	return err;
-}
-
-static int cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p)
-{
-	dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
-			  (1UL << T3_CTRL_QP_SIZE_LOG2)
-			  * sizeof(union t3_wr), rdev_p->ctrl_qp.workq,
-			  dma_unmap_addr(&rdev_p->ctrl_qp, mapping));
-	return cxio_hal_clear_qp_ctx(rdev_p, T3_CTRL_QP_ID);
-}
-
-/* write len bytes of data into addr (32B aligned address)
- * If data is NULL, clear len byte of memory to zero.
- * caller acquires the ctrl_qp lock before the call
- */
-static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr,
-				      u32 len, void *data)
-{
-	u32 i, nr_wqe, copy_len;
-	u8 *copy_data;
-	u8 wr_len, utx_len;	/* length in 8 byte flit */
-	enum t3_wr_flags flag;
-	__be64 *wqe;
-	u64 utx_cmd;
-	addr &= 0x7FFFFFF;
-	nr_wqe = len % 96 ? len / 96 + 1 : len / 96;	/* 96B max per WQE */
-	pr_debug("%s wptr 0x%x rptr 0x%x len %d, nr_wqe %d data %p addr 0x%0x\n",
-		 __func__, rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, len,
-		 nr_wqe, data, addr);
-	utx_len = 3;		/* in 32B unit */
-	for (i = 0; i < nr_wqe; i++) {
-		if (Q_FULL(rdev_p->ctrl_qp.rptr, rdev_p->ctrl_qp.wptr,
-		           T3_CTRL_QP_SIZE_LOG2)) {
-			pr_debug("%s ctrl_qp full wtpr 0x%0x rptr 0x%0x, wait for more space i %d\n",
-				 __func__,
-				 rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, i);
-			if (wait_event_interruptible(rdev_p->ctrl_qp.waitq,
-					     !Q_FULL(rdev_p->ctrl_qp.rptr,
-						     rdev_p->ctrl_qp.wptr,
-						     T3_CTRL_QP_SIZE_LOG2))) {
-				pr_debug("%s ctrl_qp workq interrupted\n",
-					 __func__);
-				return -ERESTARTSYS;
-			}
-			pr_debug("%s ctrl_qp wakeup, continue posting work request i %d\n",
-				 __func__, i);
-		}
-		wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr %
-						(1 << T3_CTRL_QP_SIZE_LOG2)));
-		flag = 0;
-		if (i == (nr_wqe - 1)) {
-			/* last WQE */
-			flag = T3_COMPLETION_FLAG;
-			if (len % 32)
-				utx_len = len / 32 + 1;
-			else
-				utx_len = len / 32;
-		}
-
-		/*
-		 * Force a CQE to return the credit to the workq in case
-		 * we posted more than half the max QP size of WRs
-		 */
-		if ((i != 0) &&
-		    (i % (((1 << T3_CTRL_QP_SIZE_LOG2)) >> 1) == 0)) {
-			flag = T3_COMPLETION_FLAG;
-			pr_debug("%s force completion at i %d\n", __func__, i);
-		}
-
-		/* build the utx mem command */
-		wqe += (sizeof(struct t3_bypass_wr) >> 3);
-		utx_cmd = (T3_UTX_MEM_WRITE << 28) | (addr + i * 3);
-		utx_cmd <<= 32;
-		utx_cmd |= (utx_len << 28) | ((utx_len << 2) + 1);
-		*wqe = cpu_to_be64(utx_cmd);
-		wqe++;
-		copy_data = (u8 *) data + i * 96;
-		copy_len = len > 96 ? 96 : len;
-
-		/* clear memory content if data is NULL */
-		if (data)
-			memcpy(wqe, copy_data, copy_len);
-		else
-			memset(wqe, 0, copy_len);
-		if (copy_len % 32)
-			memset(((u8 *) wqe) + copy_len, 0,
-			       32 - (copy_len % 32));
-		wr_len = ((sizeof(struct t3_bypass_wr)) >> 3) + 1 +
-			 (utx_len << 2);
-		wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr %
-			      (1 << T3_CTRL_QP_SIZE_LOG2)));
-
-		/* wptr in the WRID[31:0] */
-		((union t3_wrid *)(wqe+1))->id0.low = rdev_p->ctrl_qp.wptr;
-
-		/*
-		 * This must be the last write with a memory barrier
-		 * for the genbit
-		 */
-		build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_BP, flag,
-			       Q_GENBIT(rdev_p->ctrl_qp.wptr,
-					T3_CTRL_QP_SIZE_LOG2), T3_CTRL_QP_ID,
-			       wr_len, T3_SOPEOP);
-		if (flag == T3_COMPLETION_FLAG)
-			ring_doorbell(rdev_p->ctrl_qp.doorbell, T3_CTRL_QP_ID);
-		len -= 96;
-		rdev_p->ctrl_qp.wptr++;
-	}
-	return 0;
-}
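For reference, the loop above splits a transfer into 96-byte WQEs (three 32-byte UTX units each) and sizes only the last WQE from the remainder. A minimal standalone sketch of that arithmetic, using a hypothetical 200-byte payload:

    /* Illustrative only: the 96 B / 32 B chunking used by the write loop above. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int len = 200;                      /* hypothetical payload size */
        unsigned int nr_wqe = (len + 95) / 96;       /* == len % 96 ? len / 96 + 1 : len / 96 */
        unsigned int last = len - (nr_wqe - 1) * 96; /* bytes carried by the final WQE */
        unsigned int utx_len = (last + 31) / 32;     /* 32 B UTX units in the final WQE */

        printf("nr_wqe=%u last=%u utx_len=%u\n", nr_wqe, last, utx_len);
        return 0;                                    /* prints nr_wqe=3 last=8 utx_len=1 */
    }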
-
-/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl_size and pbl_addr
- * OUT: stag index
- * TBD: shared memory region support
- */
-static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry,
-			 u32 *stag, u8 stag_state, u32 pdid,
-			 enum tpt_mem_type type, enum tpt_mem_perm perm,
-			 u32 zbva, u64 to, u32 len, u8 page_size,
-			 u32 pbl_size, u32 pbl_addr)
-{
-	int err;
-	struct tpt_entry tpt;
-	u32 stag_idx;
-	u32 wptr;
-
-	if (cxio_fatal_error(rdev_p))
-		return -EIO;
-
-	stag_state = stag_state > 0;
-	stag_idx = (*stag) >> 8;
-
-	if ((!reset_tpt_entry) && !(*stag != T3_STAG_UNSET)) {
-		stag_idx = cxio_hal_get_stag(rdev_p->rscp);
-		if (!stag_idx)
-			return -ENOMEM;
-		*stag = (stag_idx << 8) | ((*stag) & 0xFF);
-	}
-	pr_debug("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n",
-		 __func__, stag_state, type, pdid, stag_idx);
-
-	mutex_lock(&rdev_p->ctrl_qp.lock);
-
-	/* write TPT entry */
-	if (reset_tpt_entry)
-		memset(&tpt, 0, sizeof(tpt));
-	else {
-		tpt.valid_stag_pdid = cpu_to_be32(F_TPT_VALID |
-				V_TPT_STAG_KEY((*stag) & M_TPT_STAG_KEY) |
-				V_TPT_STAG_STATE(stag_state) |
-				V_TPT_STAG_TYPE(type) | V_TPT_PDID(pdid));
-		BUG_ON(page_size >= 28);
-		tpt.flags_pagesize_qpid = cpu_to_be32(V_TPT_PERM(perm) |
-			((perm & TPT_MW_BIND) ? F_TPT_MW_BIND_ENABLE : 0) |
-			V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) |
-			V_TPT_PAGE_SIZE(page_size));
-		tpt.rsvd_pbl_addr = cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3));
-		tpt.len = cpu_to_be32(len);
-		tpt.va_hi = cpu_to_be32((u32) (to >> 32));
-		tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL));
-		tpt.rsvd_bind_cnt_or_pstag = 0;
-		tpt.rsvd_pbl_size = cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2));
-	}
-	err = cxio_hal_ctrl_qp_write_mem(rdev_p,
-				       stag_idx +
-				       (rdev_p->rnic_info.tpt_base >> 5),
-				       sizeof(tpt), &tpt);
-
-	/* release the stag index to free pool */
-	if (reset_tpt_entry)
-		cxio_hal_put_stag(rdev_p->rscp, stag_idx);
-
-	wptr = rdev_p->ctrl_qp.wptr;
-	mutex_unlock(&rdev_p->ctrl_qp.lock);
-	if (!err)
-		if (wait_event_interruptible(rdev_p->ctrl_qp.waitq,
-					     SEQ32_GE(rdev_p->ctrl_qp.rptr,
-						      wptr)))
-			return -ERESTARTSYS;
-	return err;
-}
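The STAG handled above packs the TPT index into bits [31:8] and an 8-bit consumer key into bits [7:0]; because each TPT entry is 32 bytes, the entry is written at stag_idx + (tpt_base >> 5) in 32-byte units. A small sketch of that layout (the base address, index and key values are hypothetical):

    /* Illustrative only: STAG layout and TPT entry addressing assumed above. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t tpt_base = 0x10000;            /* hypothetical TPT byte base */
        uint32_t stag_idx = 0x1234;             /* as returned by cxio_hal_get_stag() */
        uint32_t key      = 0xAB;               /* caller-chosen 8-bit key */

        uint32_t stag = (stag_idx << 8) | key;  /* matches "*stag = (stag_idx << 8) | ..." */
        uint32_t addr = (stag >> 8) + (tpt_base >> 5);  /* 32 B units, as passed to the
                                                         * control-QP memory write */

        printf("stag=0x%08x tpt write addr=0x%x (32B units)\n", stag, addr);
        return 0;
    }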
-
-int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl,
-		   u32 pbl_addr, u32 pbl_size)
-{
-	u32 wptr;
-	int err;
-
-	pr_debug("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
-		 __func__, pbl_addr, rdev_p->rnic_info.pbl_base,
-		 pbl_size);
-
-	mutex_lock(&rdev_p->ctrl_qp.lock);
-	err = cxio_hal_ctrl_qp_write_mem(rdev_p, pbl_addr >> 5, pbl_size << 3,
-					 pbl);
-	wptr = rdev_p->ctrl_qp.wptr;
-	mutex_unlock(&rdev_p->ctrl_qp.lock);
-	if (err)
-		return err;
-
-	if (wait_event_interruptible(rdev_p->ctrl_qp.waitq,
-				     SEQ32_GE(rdev_p->ctrl_qp.rptr,
-					      wptr)))
-		return -ERESTARTSYS;
-
-	return 0;
-}
-
-int cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid,
-			   enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
-			   u8 page_size, u32 pbl_size, u32 pbl_addr)
-{
-	*stag = T3_STAG_UNSET;
-	return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm,
-			     zbva, to, len, page_size, pbl_size, pbl_addr);
-}
-
-int cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid,
-			   enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
-			   u8 page_size, u32 pbl_size, u32 pbl_addr)
-{
-	return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm,
-			     zbva, to, len, page_size, pbl_size, pbl_addr);
-}
-
-int cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size,
-		   u32 pbl_addr)
-{
-	return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0,
-			     pbl_size, pbl_addr);
-}
-
-int cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid)
-{
-	*stag = T3_STAG_UNSET;
-	return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0,
-			     0, 0);
-}
-
-int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag)
-{
-	return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0,
-			     0, 0);
-}
-
-int cxio_allocate_stag(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr)
-{
-	*stag = T3_STAG_UNSET;
-	return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_NON_SHARED_MR,
-			     0, 0, 0ULL, 0, 0, pbl_size, pbl_addr);
-}
-
-int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr)
-{
-	struct t3_rdma_init_wr *wqe;
-	struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_ATOMIC);
-	if (!skb)
-		return -ENOMEM;
-	pr_debug("%s rdev_p %p\n", __func__, rdev_p);
-	wqe = __skb_put(skb, sizeof(*wqe));
-	wqe->wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_INIT));
-	wqe->wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(attr->tid) |
-					   V_FW_RIWR_LEN(sizeof(*wqe) >> 3));
-	wqe->wrid.id1 = 0;
-	wqe->qpid = cpu_to_be32(attr->qpid);
-	wqe->pdid = cpu_to_be32(attr->pdid);
-	wqe->scqid = cpu_to_be32(attr->scqid);
-	wqe->rcqid = cpu_to_be32(attr->rcqid);
-	wqe->rq_addr = cpu_to_be32(attr->rq_addr - rdev_p->rnic_info.rqt_base);
-	wqe->rq_size = cpu_to_be32(attr->rq_size);
-	wqe->mpaattrs = attr->mpaattrs;
-	wqe->qpcaps = attr->qpcaps;
-	wqe->ulpdu_size = cpu_to_be16(attr->tcp_emss);
-	wqe->rqe_count = cpu_to_be16(attr->rqe_count);
-	wqe->flags_rtr_type = cpu_to_be16(attr->flags |
-					  V_RTR_TYPE(attr->rtr_type) |
-					  V_CHAN(attr->chan));
-	wqe->ord = cpu_to_be32(attr->ord);
-	wqe->ird = cpu_to_be32(attr->ird);
-	wqe->qp_dma_addr = cpu_to_be64(attr->qp_dma_addr);
-	wqe->qp_dma_size = cpu_to_be32(attr->qp_dma_size);
-	wqe->irs = cpu_to_be32(attr->irs);
-	skb->priority = 0;	/* 0=>ToeQ; 1=>CtrlQ */
-	return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb);
-}
-
-void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb)
-{
-	cxio_ev_cb = ev_cb;
-}
-
-void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb)
-{
-	cxio_ev_cb = NULL;
-}
-
-static int cxio_hal_ev_handler(struct t3cdev *t3cdev_p, struct sk_buff *skb)
-{
-	static int cnt;
-	struct cxio_rdev *rdev_p = NULL;
-	struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data;
-	pr_debug("%d: %s cq_id 0x%x cq_ptr 0x%x genbit %0x overflow %0x an %0x se %0x notify %0x cqbranch %0x creditth %0x\n",
-		 cnt, __func__, RSPQ_CQID(rsp_msg), RSPQ_CQPTR(rsp_msg),
-		 RSPQ_GENBIT(rsp_msg), RSPQ_OVERFLOW(rsp_msg), RSPQ_AN(rsp_msg),
-		 RSPQ_SE(rsp_msg), RSPQ_NOTIFY(rsp_msg), RSPQ_CQBRANCH(rsp_msg),
-		 RSPQ_CREDIT_THRESH(rsp_msg));
-	pr_debug("CQE: QPID 0x%0x genbit %0x type 0x%0x status 0x%0x opcode %d len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n",
-		 CQE_QPID(rsp_msg->cqe), CQE_GENBIT(rsp_msg->cqe),
-		 CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe),
-		 CQE_OPCODE(rsp_msg->cqe), CQE_LEN(rsp_msg->cqe),
-		 CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
-	rdev_p = (struct cxio_rdev *)t3cdev_p->ulp;
-	if (!rdev_p) {
-		pr_debug("%s called by t3cdev %p with null ulp\n", __func__,
-			 t3cdev_p);
-		return 0;
-	}
-	if (CQE_QPID(rsp_msg->cqe) == T3_CTRL_QP_ID) {
-		rdev_p->ctrl_qp.rptr = CQE_WRID_LOW(rsp_msg->cqe) + 1;
-		wake_up_interruptible(&rdev_p->ctrl_qp.waitq);
-		dev_kfree_skb_irq(skb);
-	} else if (CQE_QPID(rsp_msg->cqe) == 0xfff8)
-		dev_kfree_skb_irq(skb);
-	else if (cxio_ev_cb)
-		(*cxio_ev_cb) (rdev_p, skb);
-	else
-		dev_kfree_skb_irq(skb);
-	cnt++;
-	return 0;
-}
-
-/* Caller takes care of locking if needed */
-int cxio_rdev_open(struct cxio_rdev *rdev_p)
-{
-	struct net_device *netdev_p = NULL;
-	int err = 0;
-	if (strlen(rdev_p->dev_name)) {
-		if (cxio_hal_find_rdev_by_name(rdev_p->dev_name)) {
-			return -EBUSY;
-		}
-		netdev_p = dev_get_by_name(&init_net, rdev_p->dev_name);
-		if (!netdev_p) {
-			return -EINVAL;
-		}
-		dev_put(netdev_p);
-	} else if (rdev_p->t3cdev_p) {
-		if (cxio_hal_find_rdev_by_t3cdev(rdev_p->t3cdev_p)) {
-			return -EBUSY;
-		}
-		netdev_p = rdev_p->t3cdev_p->lldev;
-		strncpy(rdev_p->dev_name, rdev_p->t3cdev_p->name,
-			T3_MAX_DEV_NAME_LEN);
-	} else {
-		pr_debug("%s t3cdev_p or dev_name must be set\n", __func__);
-		return -EINVAL;
-	}
-
-	list_add_tail(&rdev_p->entry, &rdev_list);
-
-	pr_debug("%s opening rnic dev %s\n", __func__, rdev_p->dev_name);
-	memset(&rdev_p->ctrl_qp, 0, sizeof(rdev_p->ctrl_qp));
-	if (!rdev_p->t3cdev_p)
-		rdev_p->t3cdev_p = dev2t3cdev(netdev_p);
-	rdev_p->t3cdev_p->ulp = (void *) rdev_p;
-
-	err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_EMBEDDED_INFO,
-					 &(rdev_p->fw_info));
-	if (err) {
-		pr_err("%s t3cdev_p(%p)->ctl returned error %d\n",
-		       __func__, rdev_p->t3cdev_p, err);
-		goto err1;
-	}
-	if (G_FW_VERSION_MAJOR(rdev_p->fw_info.fw_vers) != CXIO_FW_MAJ) {
-		pr_err("fatal firmware version mismatch: need version %u but adapter has version %u\n",
-		       CXIO_FW_MAJ,
-		       G_FW_VERSION_MAJOR(rdev_p->fw_info.fw_vers));
-		err = -EINVAL;
-		goto err1;
-	}
-
-	err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_GET_PARAMS,
-					 &(rdev_p->rnic_info));
-	if (err) {
-		pr_err("%s t3cdev_p(%p)->ctl returned error %d\n",
-		       __func__, rdev_p->t3cdev_p, err);
-		goto err1;
-	}
-	err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_PORTS,
-				    &(rdev_p->port_info));
-	if (err) {
-		pr_err("%s t3cdev_p(%p)->ctl returned error %d\n",
-		       __func__, rdev_p->t3cdev_p, err);
-		goto err1;
-	}
-
-	/*
-	 * qpshift is the number of bits to shift the qpid left in order
-	 * to get the correct address of the doorbell for that qp.
-	 */
-	cxio_init_ucontext(rdev_p, &rdev_p->uctx);
-	rdev_p->qpshift = PAGE_SHIFT -
-			  ilog2(65536 >>
-			            ilog2(rdev_p->rnic_info.udbell_len >>
-					      PAGE_SHIFT));
-	rdev_p->qpnr = rdev_p->rnic_info.udbell_len >> PAGE_SHIFT;
-	rdev_p->qpmask = (65536 >> ilog2(rdev_p->qpnr)) - 1;
-	pr_debug("%s rnic %s info: tpt_base 0x%0x tpt_top 0x%0x num stags %d pbl_base 0x%0x pbl_top 0x%0x rqt_base 0x%0x, rqt_top 0x%0x\n",
-		 __func__, rdev_p->dev_name, rdev_p->rnic_info.tpt_base,
-		 rdev_p->rnic_info.tpt_top, cxio_num_stags(rdev_p),
-		 rdev_p->rnic_info.pbl_base,
-		 rdev_p->rnic_info.pbl_top, rdev_p->rnic_info.rqt_base,
-		 rdev_p->rnic_info.rqt_top);
-	pr_debug("udbell_len 0x%0x udbell_physbase 0x%lx kdb_addr %p qpshift %lu qpnr %d qpmask 0x%x\n",
-		 rdev_p->rnic_info.udbell_len,
-		 rdev_p->rnic_info.udbell_physbase, rdev_p->rnic_info.kdb_addr,
-		 rdev_p->qpshift, rdev_p->qpnr, rdev_p->qpmask);
-
-	err = cxio_hal_init_ctrl_qp(rdev_p);
-	if (err) {
-		pr_err("%s error %d initializing ctrl_qp\n", __func__, err);
-		goto err1;
-	}
-	err = cxio_hal_init_resource(rdev_p, cxio_num_stags(rdev_p), 0,
-				     0, T3_MAX_NUM_QP, T3_MAX_NUM_CQ,
-				     T3_MAX_NUM_PD);
-	if (err) {
-		pr_err("%s error %d initializing hal resources\n",
-		       __func__, err);
-		goto err2;
-	}
-	err = cxio_hal_pblpool_create(rdev_p);
-	if (err) {
-		pr_err("%s error %d initializing pbl mem pool\n",
-		       __func__, err);
-		goto err3;
-	}
-	err = cxio_hal_rqtpool_create(rdev_p);
-	if (err) {
-		pr_err("%s error %d initializing rqt mem pool\n",
-		       __func__, err);
-		goto err4;
-	}
-	return 0;
-err4:
-	cxio_hal_pblpool_destroy(rdev_p);
-err3:
-	cxio_hal_destroy_resource(rdev_p->rscp);
-err2:
-	cxio_hal_destroy_ctrl_qp(rdev_p);
-err1:
-	rdev_p->t3cdev_p->ulp = NULL;
-	list_del(&rdev_p->entry);
-	return err;
-}
-
-void cxio_rdev_close(struct cxio_rdev *rdev_p)
-{
-	if (rdev_p) {
-		cxio_hal_pblpool_destroy(rdev_p);
-		cxio_hal_rqtpool_destroy(rdev_p);
-		list_del(&rdev_p->entry);
-		cxio_hal_destroy_ctrl_qp(rdev_p);
-		cxio_hal_destroy_resource(rdev_p->rscp);
-		rdev_p->t3cdev_p->ulp = NULL;
-	}
-}
-
-int __init cxio_hal_init(void)
-{
-	if (cxio_hal_init_rhdl_resource(T3_MAX_NUM_RI))
-		return -ENOMEM;
-	t3_register_cpl_handler(CPL_ASYNC_NOTIF, cxio_hal_ev_handler);
-	return 0;
-}
-
-void __exit cxio_hal_exit(void)
-{
-	struct cxio_rdev *rdev, *tmp;
-
-	t3_register_cpl_handler(CPL_ASYNC_NOTIF, NULL);
-	list_for_each_entry_safe(rdev, tmp, &rdev_list, entry)
-		cxio_rdev_close(rdev);
-	cxio_hal_destroy_rhdl_resource();
-}
-
-static void flush_completed_wrs(struct t3_wq *wq, struct t3_cq *cq)
-{
-	struct t3_swsq *sqp;
-	__u32 ptr = wq->sq_rptr;
-	int count = Q_COUNT(wq->sq_rptr, wq->sq_wptr);
-
-	sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2);
-	while (count--)
-		if (!sqp->signaled) {
-			ptr++;
-			sqp = wq->sq + Q_PTR2IDX(ptr,  wq->sq_size_log2);
-		} else if (sqp->complete) {
-
-			/*
-			 * Insert this completed cqe into the swcq.
-			 */
-			pr_debug("%s moving cqe into swcq sq idx %ld cq idx %ld\n",
-				 __func__, Q_PTR2IDX(ptr,  wq->sq_size_log2),
-				 Q_PTR2IDX(cq->sw_wptr, cq->size_log2));
-			sqp->cqe.header |= htonl(V_CQE_SWCQE(1));
-			*(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2))
-				= sqp->cqe;
-			cq->sw_wptr++;
-			sqp->signaled = 0;
-			break;
-		} else
-			break;
-}
-
-static void create_read_req_cqe(struct t3_wq *wq, struct t3_cqe *hw_cqe,
-				struct t3_cqe *read_cqe)
-{
-	read_cqe->u.scqe.wrid_hi = wq->oldest_read->sq_wptr;
-	read_cqe->len = wq->oldest_read->read_len;
-	read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(*hw_cqe)) |
-				 V_CQE_SWCQE(SW_CQE(*hw_cqe)) |
-				 V_CQE_OPCODE(T3_READ_REQ) |
-				 V_CQE_TYPE(1));
-}
-
-/*
- * Return a ptr to the next read wr in the SWSQ or NULL.
- */
-static void advance_oldest_read(struct t3_wq *wq)
-{
-
-	u32 rptr = wq->oldest_read - wq->sq + 1;
-	u32 wptr = Q_PTR2IDX(wq->sq_wptr, wq->sq_size_log2);
-
-	while (Q_PTR2IDX(rptr, wq->sq_size_log2) != wptr) {
-		wq->oldest_read = wq->sq + Q_PTR2IDX(rptr, wq->sq_size_log2);
-
-		if (wq->oldest_read->opcode == T3_READ_REQ)
-			return;
-		rptr++;
-	}
-	wq->oldest_read = NULL;
-}
-
-/*
- * cxio_poll_cq
- *
- * Caller must:
- *     check the validity of the first CQE,
- *     supply the wq associated with the qpid.
- *
- * credit: cq credit to return to sge.
- * cqe_flushed: 1 iff the CQE is flushed.
- * cqe: copy of the polled CQE.
- *
- * return value:
- *     0       CQE returned,
- *    -1       CQE skipped, try again.
- */
-int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe,
-		     u8 *cqe_flushed, u64 *cookie, u32 *credit)
-{
-	int ret = 0;
-	struct t3_cqe *hw_cqe, read_cqe;
-
-	*cqe_flushed = 0;
-	*credit = 0;
-	hw_cqe = cxio_next_cqe(cq);
-
-	pr_debug("%s CQE OOO %d qpid 0x%0x genbit %d type %d status 0x%0x opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n",
-		 __func__, CQE_OOO(*hw_cqe), CQE_QPID(*hw_cqe),
-		 CQE_GENBIT(*hw_cqe), CQE_TYPE(*hw_cqe), CQE_STATUS(*hw_cqe),
-		 CQE_OPCODE(*hw_cqe), CQE_LEN(*hw_cqe), CQE_WRID_HI(*hw_cqe),
-		 CQE_WRID_LOW(*hw_cqe));
-
-	/*
-	 * skip CQEs not affiliated with a QP.
-	 */
-	if (wq == NULL) {
-		ret = -1;
-		goto skip_cqe;
-	}
-
-	/*
-	 * Gotta tweak READ completions:
-	 *	1) the cqe doesn't contain the sq_wptr from the wr.
-	 *	2) opcode not reflected from the wr.
-	 *	3) read_len not reflected from the wr.
-	 *	4) cq_type is RQ_TYPE not SQ_TYPE.
-	 */
-	if (RQ_TYPE(*hw_cqe) && (CQE_OPCODE(*hw_cqe) == T3_READ_RESP)) {
-
-		/*
-		 * If this is an unsolicited read response, then the read
-		 * was generated by the kernel driver as part of peer-2-peer
-		 * connection setup.  So ignore the completion.
-		 */
-		if (!wq->oldest_read) {
-			if (CQE_STATUS(*hw_cqe))
-				wq->error = 1;
-			ret = -1;
-			goto skip_cqe;
-		}
-
-		/*
-		 * Don't write to the HWCQ, so create a new read req CQE
-		 * in local memory.
-		 */
-		create_read_req_cqe(wq, hw_cqe, &read_cqe);
-		hw_cqe = &read_cqe;
-		advance_oldest_read(wq);
-	}
-
-	/*
-	 * T3A: Discard TERMINATE CQEs.
-	 */
-	if (CQE_OPCODE(*hw_cqe) == T3_TERMINATE) {
-		ret = -1;
-		wq->error = 1;
-		goto skip_cqe;
-	}
-
-	if (CQE_STATUS(*hw_cqe) || wq->error) {
-		*cqe_flushed = wq->error;
-		wq->error = 1;
-
-		/*
-		 * T3A inserts errors into the CQE.  We cannot return
-		 * these as work completions.
-		 */
-		/* incoming write failures */
-		if ((CQE_OPCODE(*hw_cqe) == T3_RDMA_WRITE)
-		     && RQ_TYPE(*hw_cqe)) {
-			ret = -1;
-			goto skip_cqe;
-		}
-		/* incoming read request failures */
-		if ((CQE_OPCODE(*hw_cqe) == T3_READ_RESP) && SQ_TYPE(*hw_cqe)) {
-			ret = -1;
-			goto skip_cqe;
-		}
-
-		/* incoming SEND with no receive posted failures */
-		if (CQE_SEND_OPCODE(*hw_cqe) && RQ_TYPE(*hw_cqe) &&
-		    Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) {
-			ret = -1;
-			goto skip_cqe;
-		}
-		BUG_ON((*cqe_flushed == 0) && !SW_CQE(*hw_cqe));
-		goto proc_cqe;
-	}
-
-	/*
-	 * RECV completion.
-	 */
-	if (RQ_TYPE(*hw_cqe)) {
-
-		/*
-		 * HW only validates 4 bits of MSN.  So we must validate that
-		 * the MSN in the SEND is the next expected MSN.  If it's not,
-		 * then we complete this with TPT_ERR_MSN and mark the wq in
-		 * error.
-		 */
-
-		if (Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) {
-			wq->error = 1;
-			ret = -1;
-			goto skip_cqe;
-		}
-
-		if (unlikely((CQE_WRID_MSN(*hw_cqe) != (wq->rq_rptr + 1)))) {
-			wq->error = 1;
-			hw_cqe->header |= htonl(V_CQE_STATUS(TPT_ERR_MSN));
-			goto proc_cqe;
-		}
-		goto proc_cqe;
-	}
-
-	/*
-	 * If we get here it's a send completion.
-	 *
-	 * Handle out of order completion. These get stuffed
-	 * in the SW SQ. Then the SW SQ is walked to move any
-	 * now in-order completions into the SW CQ.  This handles
-	 * 2 cases:
-	 *	1) reaping unsignaled WRs when the first subsequent
-	 *	   signaled WR is completed.
-	 *	2) out of order read completions.
-	 */
-	if (!SW_CQE(*hw_cqe) && (CQE_WRID_SQ_WPTR(*hw_cqe) != wq->sq_rptr)) {
-		struct t3_swsq *sqp;
-
-		pr_debug("%s out of order completion going in swsq at idx %ld\n",
-			 __func__,
-			 Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe),
-				   wq->sq_size_log2));
-		sqp = wq->sq +
-		      Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2);
-		sqp->cqe = *hw_cqe;
-		sqp->complete = 1;
-		ret = -1;
-		goto flush_wq;
-	}
-
-proc_cqe:
-	*cqe = *hw_cqe;
-
-	/*
-	 * Reap the associated WR(s) that are freed up with this
-	 * completion.
-	 */
-	if (SQ_TYPE(*hw_cqe)) {
-		wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe);
-		pr_debug("%s completing sq idx %ld\n", __func__,
-			 Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2));
-		*cookie = wq->sq[Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)].wr_id;
-		wq->sq_rptr++;
-	} else {
-		pr_debug("%s completing rq idx %ld\n", __func__,
-			 Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2));
-		*cookie = wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].wr_id;
-		if (wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].pbl_addr)
-			cxio_hal_pblpool_free(wq->rdev,
-				wq->rq[Q_PTR2IDX(wq->rq_rptr,
-				wq->rq_size_log2)].pbl_addr, T3_STAG0_PBL_SIZE);
-		BUG_ON(Q_EMPTY(wq->rq_rptr, wq->rq_wptr));
-		wq->rq_rptr++;
-	}
-
-flush_wq:
-	/*
-	 * Flush any completed cqes that are now in-order.
-	 */
-	flush_completed_wrs(wq, cq);
-
-skip_cqe:
-	if (SW_CQE(*hw_cqe)) {
-		pr_debug("%s cq %p cqid 0x%x skip sw cqe sw_rptr 0x%x\n",
-			 __func__, cq, cq->cqid, cq->sw_rptr);
-		++cq->sw_rptr;
-	} else {
-		pr_debug("%s cq %p cqid 0x%x skip hw cqe rptr 0x%x\n",
-			 __func__, cq, cq->cqid, cq->rptr);
-		++cq->rptr;
-
-		/*
-		 * T3A: compute credits.
-		 */
-		if (((cq->rptr - cq->wptr) > (1 << (cq->size_log2 - 1)))
-		    || ((cq->rptr - cq->wptr) >= 128)) {
-			*credit = cq->rptr - cq->wptr;
-			cq->wptr = cq->rptr;
-		}
-	}
-	return ret;
-}
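The T3A credit path in the skip branch above only reports credits back once the gap between rptr and wptr exceeds half the CQ or reaches 128 entries. A minimal worked example of that check, with a hypothetical CQ size and pointer values:

    /* Illustrative only: when the skip path above returns CQ credits. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        unsigned int size_log2 = 6;      /* hypothetical 64-entry CQ */
        uint32_t rptr = 140, wptr = 100; /* 40 CQEs consumed but not yet credited */

        uint32_t gap = rptr - wptr;
        if (gap > (1u << (size_log2 - 1)) || gap >= 128) {
            printf("return %u credits, wptr catches up to rptr\n", gap);
            wptr = rptr;
        } else {
            printf("keep accumulating (gap=%u)\n", gap);
        }
        return 0;
    }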
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h
deleted file mode 100644
index 40c029f..0000000
--- a/drivers/infiniband/hw/cxgb3/cxio_hal.h
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef  __CXIO_HAL_H__
-#define  __CXIO_HAL_H__
-
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/kfifo.h>
-
-#include "t3_cpl.h"
-#include "t3cdev.h"
-#include "cxgb3_ctl_defs.h"
-#include "cxio_wr.h"
-
-#define T3_CTRL_QP_ID    FW_RI_SGEEC_START
-#define T3_CTL_QP_TID	 FW_RI_TID_START
-#define T3_CTRL_QP_SIZE_LOG2  8
-#define T3_CTRL_CQ_ID    0
-
-#define T3_MAX_NUM_RI (1<<15)
-#define T3_MAX_NUM_QP (1<<15)
-#define T3_MAX_NUM_CQ (1<<15)
-#define T3_MAX_NUM_PD (1<<15)
-#define T3_MAX_PBL_SIZE 256
-#define T3_MAX_RQ_SIZE 1024
-#define T3_MAX_QP_DEPTH (T3_MAX_RQ_SIZE-1)
-#define T3_MAX_CQ_DEPTH 65536
-#define T3_MAX_NUM_STAG (1<<15)
-#define T3_MAX_MR_SIZE 0x100000000ULL
-#define T3_PAGESIZE_MASK 0xffff000  /* 4KB-128MB */
-
-#define T3_STAG_UNSET 0xffffffff
-
-#define T3_MAX_DEV_NAME_LEN 32
-
-#define CXIO_FW_MAJ 7
-
-struct cxio_hal_ctrl_qp {
-	u32 wptr;
-	u32 rptr;
-	struct mutex lock;	/* for the wptr, can sleep */
-	wait_queue_head_t waitq;/* wait for RspQ/CQE msg */
-	union t3_wr *workq;	/* the work request queue */
-	dma_addr_t dma_addr;	/* pci bus address of the workq */
-	DEFINE_DMA_UNMAP_ADDR(mapping);
-	void __iomem *doorbell;
-};
-
-struct cxio_hal_resource {
-	struct kfifo tpt_fifo;
-	spinlock_t tpt_fifo_lock;
-	struct kfifo qpid_fifo;
-	spinlock_t qpid_fifo_lock;
-	struct kfifo cqid_fifo;
-	spinlock_t cqid_fifo_lock;
-	struct kfifo pdid_fifo;
-	spinlock_t pdid_fifo_lock;
-};
-
-struct cxio_qpid_list {
-	struct list_head entry;
-	u32 qpid;
-};
-
-struct cxio_ucontext {
-	struct list_head qpids;
-	struct mutex lock;
-};
-
-struct cxio_rdev {
-	char dev_name[T3_MAX_DEV_NAME_LEN];
-	struct t3cdev *t3cdev_p;
-	struct rdma_info rnic_info;
-	struct adap_ports port_info;
-	struct cxio_hal_resource *rscp;
-	struct cxio_hal_ctrl_qp ctrl_qp;
-	void *ulp;
-	unsigned long qpshift;
-	u32 qpnr;
-	u32 qpmask;
-	struct cxio_ucontext uctx;
-	struct gen_pool *pbl_pool;
-	struct gen_pool *rqt_pool;
-	struct list_head entry;
-	struct ch_embedded_info fw_info;
-	u32	flags;
-#define	CXIO_ERROR_FATAL	1
-};
-
-static inline int cxio_fatal_error(struct cxio_rdev *rdev_p)
-{
-	return rdev_p->flags & CXIO_ERROR_FATAL;
-}
-
-static inline int cxio_num_stags(struct cxio_rdev *rdev_p)
-{
-	return min((int)T3_MAX_NUM_STAG, (int)((rdev_p->rnic_info.tpt_top - rdev_p->rnic_info.tpt_base) >> 5));
-}
-
-typedef void (*cxio_hal_ev_callback_func_t) (struct cxio_rdev * rdev_p,
-					     struct sk_buff * skb);
-
-#define RSPQ_CQID(rsp) (be32_to_cpu(rsp->cq_ptrid) & 0xffff)
-#define RSPQ_CQPTR(rsp) ((be32_to_cpu(rsp->cq_ptrid) >> 16) & 0xffff)
-#define RSPQ_GENBIT(rsp) ((be32_to_cpu(rsp->flags) >> 16) & 1)
-#define RSPQ_OVERFLOW(rsp) ((be32_to_cpu(rsp->flags) >> 17) & 1)
-#define RSPQ_AN(rsp) ((be32_to_cpu(rsp->flags) >> 18) & 1)
-#define RSPQ_SE(rsp) ((be32_to_cpu(rsp->flags) >> 19) & 1)
-#define RSPQ_NOTIFY(rsp) ((be32_to_cpu(rsp->flags) >> 20) & 1)
-#define RSPQ_CQBRANCH(rsp) ((be32_to_cpu(rsp->flags) >> 21) & 1)
-#define RSPQ_CREDIT_THRESH(rsp) ((be32_to_cpu(rsp->flags) >> 22) & 1)
-
-struct respQ_msg_t {
-	__be32 flags;		/* flit 0 */
-	__be32 cq_ptrid;
-	__be64 rsvd;		/* flit 1 */
-	struct t3_cqe cqe;	/* flits 2-3 */
-};
-
-enum t3_cq_opcode {
-	CQ_ARM_AN = 0x2,
-	CQ_ARM_SE = 0x6,
-	CQ_FORCE_AN = 0x3,
-	CQ_CREDIT_UPDATE = 0x7
-};
-
-int cxio_rdev_open(struct cxio_rdev *rdev);
-void cxio_rdev_close(struct cxio_rdev *rdev);
-int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq,
-		   enum t3_cq_opcode op, u32 credit);
-int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel);
-void cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
-void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
-void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
-int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq,
-		   struct cxio_ucontext *uctx);
-int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq,
-		    struct cxio_ucontext *uctx);
-int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode);
-int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl,
-		   u32 pbl_addr, u32 pbl_size);
-int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
-			   enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
-			   u8 page_size, u32 pbl_size, u32 pbl_addr);
-int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
-			   enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
-			   u8 page_size, u32 pbl_size, u32 pbl_addr);
-int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size,
-		   u32 pbl_addr);
-int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid);
-int cxio_allocate_stag(struct cxio_rdev *rdev, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr);
-int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag);
-int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr);
-void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb);
-void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb);
-u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp);
-void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid);
-int __init cxio_hal_init(void);
-void __exit cxio_hal_exit(void);
-int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count);
-int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count);
-void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count);
-void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count);
-void cxio_flush_hw_cq(struct t3_cq *cq);
-int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe,
-		     u8 *cqe_flushed, u64 *cookie, u32 *credit);
-int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb);
-
-#ifdef pr_fmt
-#undef pr_fmt
-#endif
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#endif
diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c
deleted file mode 100644
index c6e7bc4..0000000
--- a/drivers/infiniband/hw/cxgb3/cxio_resource.c
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-/* Crude resource management */
-#include <linux/kernel.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include <linux/kfifo.h>
-#include <linux/spinlock.h>
-#include <linux/errno.h>
-#include "cxio_resource.h"
-#include "cxio_hal.h"
-
-static struct kfifo rhdl_fifo;
-static spinlock_t rhdl_fifo_lock;
-
-#define RANDOM_SIZE 16
-
-static int __cxio_init_resource_fifo(struct kfifo *fifo,
-				   spinlock_t *fifo_lock,
-				   u32 nr, u32 skip_low,
-				   u32 skip_high,
-				   int random)
-{
-	u32 i, j, entry = 0, idx;
-	u32 random_bytes;
-	u32 rarray[16];
-	spin_lock_init(fifo_lock);
-
-	if (kfifo_alloc(fifo, nr * sizeof(u32), GFP_KERNEL))
-		return -ENOMEM;
-
-	for (i = 0; i < skip_low + skip_high; i++)
-		kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32));
-	if (random) {
-		j = 0;
-		random_bytes = prandom_u32();
-		for (i = 0; i < RANDOM_SIZE; i++)
-			rarray[i] = i + skip_low;
-		for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) {
-			if (j >= RANDOM_SIZE) {
-				j = 0;
-				random_bytes = prandom_u32();
-			}
-			idx = (random_bytes >> (j * 2)) & 0xF;
-			kfifo_in(fifo,
-				(unsigned char *) &rarray[idx],
-				sizeof(u32));
-			rarray[idx] = i;
-			j++;
-		}
-		for (i = 0; i < RANDOM_SIZE; i++)
-			kfifo_in(fifo,
-				(unsigned char *) &rarray[i],
-				sizeof(u32));
-	} else
-		for (i = skip_low; i < nr - skip_high; i++)
-			kfifo_in(fifo, (unsigned char *) &i, sizeof(u32));
-
-	for (i = 0; i < skip_low + skip_high; i++)
-		if (kfifo_out_locked(fifo, (unsigned char *) &entry,
-				sizeof(u32), fifo_lock) != sizeof(u32))
-					break;
-	return 0;
-}
-
-static int cxio_init_resource_fifo(struct kfifo *fifo, spinlock_t * fifo_lock,
-				   u32 nr, u32 skip_low, u32 skip_high)
-{
-	return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low,
-					  skip_high, 0));
-}
-
-static int cxio_init_resource_fifo_random(struct kfifo *fifo,
-				   spinlock_t * fifo_lock,
-				   u32 nr, u32 skip_low, u32 skip_high)
-{
-
-	return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low,
-					  skip_high, 1));
-}
-
-static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p)
-{
-	u32 i;
-
-	spin_lock_init(&rdev_p->rscp->qpid_fifo_lock);
-
-	if (kfifo_alloc(&rdev_p->rscp->qpid_fifo, T3_MAX_NUM_QP * sizeof(u32),
-					      GFP_KERNEL))
-		return -ENOMEM;
-
-	for (i = 16; i < T3_MAX_NUM_QP; i++)
-		if (!(i & rdev_p->qpmask))
-			kfifo_in(&rdev_p->rscp->qpid_fifo,
-				    (unsigned char *) &i, sizeof(u32));
-	return 0;
-}
-
-int cxio_hal_init_rhdl_resource(u32 nr_rhdl)
-{
-	return cxio_init_resource_fifo(&rhdl_fifo, &rhdl_fifo_lock, nr_rhdl, 1,
-				       0);
-}
-
-void cxio_hal_destroy_rhdl_resource(void)
-{
-	kfifo_free(&rhdl_fifo);
-}
-
-/* nr_* must be power of 2 */
-int cxio_hal_init_resource(struct cxio_rdev *rdev_p,
-			   u32 nr_tpt, u32 nr_pbl,
-			   u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, u32 nr_pdid)
-{
-	int err = 0;
-	struct cxio_hal_resource *rscp;
-
-	rscp = kmalloc(sizeof(*rscp), GFP_KERNEL);
-	if (!rscp)
-		return -ENOMEM;
-	rdev_p->rscp = rscp;
-	err = cxio_init_resource_fifo_random(&rscp->tpt_fifo,
-				      &rscp->tpt_fifo_lock,
-				      nr_tpt, 1, 0);
-	if (err)
-		goto tpt_err;
-	err = cxio_init_qpid_fifo(rdev_p);
-	if (err)
-		goto qpid_err;
-	err = cxio_init_resource_fifo(&rscp->cqid_fifo, &rscp->cqid_fifo_lock,
-				      nr_cqid, 1, 0);
-	if (err)
-		goto cqid_err;
-	err = cxio_init_resource_fifo(&rscp->pdid_fifo, &rscp->pdid_fifo_lock,
-				      nr_pdid, 1, 0);
-	if (err)
-		goto pdid_err;
-	return 0;
-pdid_err:
-	kfifo_free(&rscp->cqid_fifo);
-cqid_err:
-	kfifo_free(&rscp->qpid_fifo);
-qpid_err:
-	kfifo_free(&rscp->tpt_fifo);
-tpt_err:
-	return -ENOMEM;
-}
-
-/*
- * returns 0 if no resource available
- */
-static u32 cxio_hal_get_resource(struct kfifo *fifo, spinlock_t * lock)
-{
-	u32 entry;
-	if (kfifo_out_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock))
-		return entry;
-	else
-		return 0;	/* fifo empty */
-}
-
-static void cxio_hal_put_resource(struct kfifo *fifo, spinlock_t * lock,
-		u32 entry)
-{
-	BUG_ON(
-	kfifo_in_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock)
-	== 0);
-}
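The get/put helpers above implement a simple ID pool on top of a kfifo of u32s: the fifo is preloaded at init time and drained/refilled under a spinlock, with 0 acting as the "none available" value. A condensed sketch of that pattern under the same kernel APIs used in this file; the id_pool_* names are hypothetical:

    /* Condensed sketch of the kfifo-backed ID pool pattern used above. */
    #include <linux/types.h>
    #include <linux/errno.h>
    #include <linux/kfifo.h>
    #include <linux/spinlock.h>
    #include <linux/slab.h>

    static struct kfifo id_fifo;
    static spinlock_t id_lock;

    static int id_pool_init(u32 nr)                 /* hypothetical helper */
    {
        u32 i;

        spin_lock_init(&id_lock);
        if (kfifo_alloc(&id_fifo, nr * sizeof(u32), GFP_KERNEL))
            return -ENOMEM;
        for (i = 1; i < nr; i++)                    /* 0 stays reserved, as above */
            kfifo_in(&id_fifo, (unsigned char *)&i, sizeof(u32));
        return 0;
    }

    static u32 id_pool_get(void)                    /* 0 means "pool empty" */
    {
        u32 id;

        if (kfifo_out_locked(&id_fifo, (unsigned char *)&id, sizeof(u32),
                             &id_lock) != sizeof(u32))
            return 0;
        return id;
    }

    static void id_pool_put(u32 id)
    {
        kfifo_in_locked(&id_fifo, (unsigned char *)&id, sizeof(u32), &id_lock);
    }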
-
-u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp)
-{
-	return cxio_hal_get_resource(&rscp->tpt_fifo, &rscp->tpt_fifo_lock);
-}
-
-void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag)
-{
-	cxio_hal_put_resource(&rscp->tpt_fifo, &rscp->tpt_fifo_lock, stag);
-}
-
-u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp)
-{
-	u32 qpid = cxio_hal_get_resource(&rscp->qpid_fifo,
-			&rscp->qpid_fifo_lock);
-	pr_debug("%s qpid 0x%x\n", __func__, qpid);
-	return qpid;
-}
-
-void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid)
-{
-	pr_debug("%s qpid 0x%x\n", __func__, qpid);
-	cxio_hal_put_resource(&rscp->qpid_fifo, &rscp->qpid_fifo_lock, qpid);
-}
-
-u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp)
-{
-	return cxio_hal_get_resource(&rscp->cqid_fifo, &rscp->cqid_fifo_lock);
-}
-
-void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid)
-{
-	cxio_hal_put_resource(&rscp->cqid_fifo, &rscp->cqid_fifo_lock, cqid);
-}
-
-u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp)
-{
-	return cxio_hal_get_resource(&rscp->pdid_fifo, &rscp->pdid_fifo_lock);
-}
-
-void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid)
-{
-	cxio_hal_put_resource(&rscp->pdid_fifo, &rscp->pdid_fifo_lock, pdid);
-}
-
-void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp)
-{
-	kfifo_free(&rscp->tpt_fifo);
-	kfifo_free(&rscp->cqid_fifo);
-	kfifo_free(&rscp->qpid_fifo);
-	kfifo_free(&rscp->pdid_fifo);
-	kfree(rscp);
-}
-
-/*
- * PBL Memory Manager.  Uses Linux generic allocator.
- */
-
-#define MIN_PBL_SHIFT 8			/* 256B == min PBL size (32 entries) */
-
-u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size)
-{
-	unsigned long addr = gen_pool_alloc(rdev_p->pbl_pool, size);
-	pr_debug("%s addr 0x%x size %d\n", __func__, (u32)addr, size);
-	return (u32)addr;
-}
-
-void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size)
-{
-	pr_debug("%s addr 0x%x size %d\n", __func__, addr, size);
-	gen_pool_free(rdev_p->pbl_pool, (unsigned long)addr, size);
-}
-
-int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p)
-{
-	unsigned pbl_start, pbl_chunk;
-
-	rdev_p->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1);
-	if (!rdev_p->pbl_pool)
-		return -ENOMEM;
-
-	pbl_start = rdev_p->rnic_info.pbl_base;
-	pbl_chunk = rdev_p->rnic_info.pbl_top - pbl_start + 1;
-
-	while (pbl_start < rdev_p->rnic_info.pbl_top) {
-		pbl_chunk = min(rdev_p->rnic_info.pbl_top - pbl_start + 1,
-				pbl_chunk);
-		if (gen_pool_add(rdev_p->pbl_pool, pbl_start, pbl_chunk, -1)) {
-			pr_debug("%s failed to add PBL chunk (%x/%x)\n",
-				 __func__, pbl_start, pbl_chunk);
-			if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) {
-				pr_warn("%s: Failed to add all PBL chunks (%x/%x)\n",
-					__func__, pbl_start,
-					rdev_p->rnic_info.pbl_top - pbl_start);
-				return 0;
-			}
-			pbl_chunk >>= 1;
-		} else {
-			pr_debug("%s added PBL chunk (%x/%x)\n",
-				 __func__, pbl_start, pbl_chunk);
-			pbl_start += pbl_chunk;
-		}
-	}
-
-	return 0;
-}
-
-void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p)
-{
-	gen_pool_destroy(rdev_p->pbl_pool);
-}
-
-/*
- * RQT Memory Manager.  Uses Linux generic allocator.
- */
-
-#define MIN_RQT_SHIFT 10	/* 1KB == min RQT size (16 entries) */
-#define RQT_CHUNK 2*1024*1024
-
-u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size)
-{
-	unsigned long addr = gen_pool_alloc(rdev_p->rqt_pool, size << 6);
-	pr_debug("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6);
-	return (u32)addr;
-}
-
-void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size)
-{
-	pr_debug("%s addr 0x%x size %d\n", __func__, addr, size << 6);
-	gen_pool_free(rdev_p->rqt_pool, (unsigned long)addr, size << 6);
-}
-
-int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p)
-{
-	unsigned long i;
-	rdev_p->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1);
-	if (rdev_p->rqt_pool)
-		for (i = rdev_p->rnic_info.rqt_base;
-		     i <= rdev_p->rnic_info.rqt_top - RQT_CHUNK + 1;
-		     i += RQT_CHUNK)
-			gen_pool_add(rdev_p->rqt_pool, i, RQT_CHUNK, -1);
-	return rdev_p->rqt_pool ? 0 : -ENOMEM;
-}
-
-void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p)
-{
-	gen_pool_destroy(rdev_p->rqt_pool);
-}
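Both pools above follow the same gen_pool recipe: create a pool with a minimum allocation order, seed it with the adapter's on-chip address range, then hand out sub-ranges with gen_pool_alloc()/gen_pool_free(). A condensed sketch of that pattern using the same genalloc calls; the adapter_pool_* names are hypothetical:

    /* Condensed sketch of the gen_pool pattern used by the PBL and RQT managers. */
    #include <linux/errno.h>
    #include <linux/genalloc.h>

    static struct gen_pool *adapter_pool;           /* hypothetical pool handle */

    static int adapter_pool_create(unsigned long base, size_t size)
    {
        adapter_pool = gen_pool_create(8, -1);      /* 2^8 = 256 B minimum allocation */
        if (!adapter_pool)
            return -ENOMEM;
        if (gen_pool_add(adapter_pool, base, size, -1)) {   /* seed the adapter range */
            gen_pool_destroy(adapter_pool);
            return -ENOMEM;
        }
        return 0;
    }

    static unsigned long adapter_pool_alloc(size_t size)
    {
        return gen_pool_alloc(adapter_pool, size);  /* returns 0 when exhausted */
    }

    static void adapter_pool_free(unsigned long addr, size_t size)
    {
        gen_pool_free(adapter_pool, addr, size);
    }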
diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.h b/drivers/infiniband/hw/cxgb3/cxio_resource.h
deleted file mode 100644
index a2703a3..0000000
--- a/drivers/infiniband/hw/cxgb3/cxio_resource.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __CXIO_RESOURCE_H__
-#define __CXIO_RESOURCE_H__
-
-#include <linux/kernel.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include <linux/kfifo.h>
-#include <linux/spinlock.h>
-#include <linux/errno.h>
-#include <linux/genalloc.h>
-#include "cxio_hal.h"
-
-extern int cxio_hal_init_rhdl_resource(u32 nr_rhdl);
-extern void cxio_hal_destroy_rhdl_resource(void);
-extern int cxio_hal_init_resource(struct cxio_rdev *rdev_p,
-				  u32 nr_tpt, u32 nr_pbl,
-				  u32 nr_rqt, u32 nr_qpid, u32 nr_cqid,
-				  u32 nr_pdid);
-extern u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp);
-extern void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag);
-extern u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp);
-extern void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid);
-extern u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp);
-extern void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid);
-extern void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp);
-
-#define PBL_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.pbl_base )
-extern int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p);
-extern void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p);
-extern u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size);
-extern void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size);
-
-#define RQT_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.rqt_base )
-extern int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p);
-extern void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p);
-extern u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size);
-extern void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size);
-#endif
diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h
deleted file mode 100644
index 53aa5c3..0000000
--- a/drivers/infiniband/hw/cxgb3/cxio_wr.h
+++ /dev/null
@@ -1,802 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __CXIO_WR_H__
-#define __CXIO_WR_H__
-
-#include <asm/io.h>
-#include <linux/pci.h>
-#include <linux/timer.h>
-#include "firmware_exports.h"
-
-#define T3_MAX_SGE      4
-#define T3_MAX_INLINE	64
-#define T3_STAG0_PBL_SIZE (2 * T3_MAX_SGE << 3)
-#define T3_STAG0_MAX_PBE_LEN (128 * 1024 * 1024)
-#define T3_STAG0_PAGE_SHIFT 15
-
-#define Q_EMPTY(rptr,wptr) ((rptr)==(wptr))
-#define Q_FULL(rptr,wptr,size_log2)  ( (((wptr)-(rptr))>>(size_log2)) && \
-				       ((rptr)!=(wptr)) )
-#define Q_GENBIT(ptr,size_log2) (!(((ptr)>>size_log2)&0x1))
-#define Q_FREECNT(rptr,wptr,size_log2) ((1UL<<size_log2)-((wptr)-(rptr)))
-#define Q_COUNT(rptr,wptr) ((wptr)-(rptr))
-#define Q_PTR2IDX(ptr,size_log2) (ptr & ((1UL<<size_log2)-1))
-
-static inline void ring_doorbell(void __iomem *doorbell, u32 qpid)
-{
-	writel(((1<<31) | qpid), doorbell);
-}
-
-#define SEQ32_GE(x,y) (!( (((u32) (x)) - ((u32) (y))) & 0x80000000 ))
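These macros work on free-running 32-bit producer/consumer counters rather than wrapped indices, so the unsigned subtraction stays correct across a 2^32 wrap and the gen bit flips on every pass over the ring. A runnable sketch with hypothetical pointer values:

    /* Illustrative only: free-running pointer arithmetic behind the Q_* macros. */
    #include <stdio.h>
    #include <stdint.h>

    #define SIZE_LOG2 8                      /* hypothetical 256-entry queue */

    int main(void)
    {
        uint32_t rptr = 0xfffffff0, wptr = 0x00000010;  /* wptr has wrapped past 2^32 */

        uint32_t count   = wptr - rptr;                      /* Q_COUNT   -> 0x20 */
        uint32_t freecnt = (1u << SIZE_LOG2) - count;        /* Q_FREECNT -> 0xe0 */
        int full = ((wptr - rptr) >> SIZE_LOG2) && (rptr != wptr);   /* Q_FULL   */
        uint32_t idx = wptr & ((1u << SIZE_LOG2) - 1);       /* Q_PTR2IDX -> 0x10 */
        int gen = !((wptr >> SIZE_LOG2) & 0x1);              /* Q_GENBIT flips each lap */
        int ge  = !((rptr - wptr) & 0x80000000);             /* SEQ32_GE(rptr, wptr)    */

        printf("count=0x%x free=0x%x full=%d idx=0x%x gen=%d ge=%d\n",
               count, freecnt, full, idx, gen, ge);
        return 0;
    }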
-
-enum t3_wr_flags {
-	T3_COMPLETION_FLAG = 0x01,
-	T3_NOTIFY_FLAG = 0x02,
-	T3_SOLICITED_EVENT_FLAG = 0x04,
-	T3_READ_FENCE_FLAG = 0x08,
-	T3_LOCAL_FENCE_FLAG = 0x10
-} __packed;
-
-enum t3_wr_opcode {
-	T3_WR_BP = FW_WROPCODE_RI_BYPASS,
-	T3_WR_SEND = FW_WROPCODE_RI_SEND,
-	T3_WR_WRITE = FW_WROPCODE_RI_RDMA_WRITE,
-	T3_WR_READ = FW_WROPCODE_RI_RDMA_READ,
-	T3_WR_INV_STAG = FW_WROPCODE_RI_LOCAL_INV,
-	T3_WR_BIND = FW_WROPCODE_RI_BIND_MW,
-	T3_WR_RCV = FW_WROPCODE_RI_RECEIVE,
-	T3_WR_INIT = FW_WROPCODE_RI_RDMA_INIT,
-	T3_WR_QP_MOD = FW_WROPCODE_RI_MODIFY_QP,
-	T3_WR_FASTREG = FW_WROPCODE_RI_FASTREGISTER_MR
-} __packed;
-
-enum t3_rdma_opcode {
-	T3_RDMA_WRITE,		/* IETF RDMAP v1.0 ... */
-	T3_READ_REQ,
-	T3_READ_RESP,
-	T3_SEND,
-	T3_SEND_WITH_INV,
-	T3_SEND_WITH_SE,
-	T3_SEND_WITH_SE_INV,
-	T3_TERMINATE,
-	T3_RDMA_INIT,		/* CHELSIO RI specific ... */
-	T3_BIND_MW,
-	T3_FAST_REGISTER,
-	T3_LOCAL_INV,
-	T3_QP_MOD,
-	T3_BYPASS,
-	T3_RDMA_READ_REQ_WITH_INV,
-} __packed;
-
-static inline enum t3_rdma_opcode wr2opcode(enum t3_wr_opcode wrop)
-{
-	switch (wrop) {
-		case T3_WR_BP: return T3_BYPASS;
-		case T3_WR_SEND: return T3_SEND;
-		case T3_WR_WRITE: return T3_RDMA_WRITE;
-		case T3_WR_READ: return T3_READ_REQ;
-		case T3_WR_INV_STAG: return T3_LOCAL_INV;
-		case T3_WR_BIND: return T3_BIND_MW;
-		case T3_WR_INIT: return T3_RDMA_INIT;
-		case T3_WR_QP_MOD: return T3_QP_MOD;
-		case T3_WR_FASTREG: return T3_FAST_REGISTER;
-		default: break;
-	}
-	return -1;
-}
-
-
-/* Work request id */
-union t3_wrid {
-	struct {
-		u32 hi;
-		u32 low;
-	} id0;
-	u64 id1;
-};
-
-#define WRID(wrid)		(wrid.id1)
-#define WRID_GEN(wrid)		(wrid.id0.wr_gen)
-#define WRID_IDX(wrid)		(wrid.id0.wr_idx)
-#define WRID_LO(wrid)		(wrid.id0.wr_lo)
-
-struct fw_riwrh {
-	__be32 op_seop_flags;
-	__be32 gen_tid_len;
-};
-
-#define S_FW_RIWR_OP		24
-#define M_FW_RIWR_OP		0xff
-#define V_FW_RIWR_OP(x)		((x) << S_FW_RIWR_OP)
-#define G_FW_RIWR_OP(x)	((((x) >> S_FW_RIWR_OP)) & M_FW_RIWR_OP)
-
-#define S_FW_RIWR_SOPEOP	22
-#define M_FW_RIWR_SOPEOP	0x3
-#define V_FW_RIWR_SOPEOP(x)	((x) << S_FW_RIWR_SOPEOP)
-
-#define S_FW_RIWR_FLAGS		8
-#define M_FW_RIWR_FLAGS		0x3fffff
-#define V_FW_RIWR_FLAGS(x)	((x) << S_FW_RIWR_FLAGS)
-#define G_FW_RIWR_FLAGS(x)	((((x) >> S_FW_RIWR_FLAGS)) & M_FW_RIWR_FLAGS)
-
-#define S_FW_RIWR_TID		8
-#define V_FW_RIWR_TID(x)	((x) << S_FW_RIWR_TID)
-
-#define S_FW_RIWR_LEN		0
-#define V_FW_RIWR_LEN(x)	((x) << S_FW_RIWR_LEN)
-
-#define S_FW_RIWR_GEN           31
-#define V_FW_RIWR_GEN(x)        ((x)  << S_FW_RIWR_GEN)
-
-struct t3_sge {
-	__be32 stag;
-	__be32 len;
-	__be64 to;
-};
-
-/* If num_sgle is zero, flit 5+ contains immediate data.*/
-struct t3_send_wr {
-	struct fw_riwrh wrh;	/* 0 */
-	union t3_wrid wrid;	/* 1 */
-
-	u8 rdmaop;		/* 2 */
-	u8 reserved[3];
-	__be32 rem_stag;
-	__be32 plen;		/* 3 */
-	__be32 num_sgle;
-	struct t3_sge sgl[T3_MAX_SGE];	/* 4+ */
-};
-
-#define T3_MAX_FASTREG_DEPTH 10
-#define T3_MAX_FASTREG_FRAG 10
-
-struct t3_fastreg_wr {
-	struct fw_riwrh wrh;	/* 0 */
-	union t3_wrid wrid;	/* 1 */
-	__be32 stag;		/* 2 */
-	__be32 len;
-	__be32 va_base_hi;	/* 3 */
-	__be32 va_base_lo_fbo;
-	__be32 page_type_perms; /* 4 */
-	__be32 reserved1;
-	__be64 pbl_addrs[0];	/* 5+ */
-};
-
-/*
- * If a fastreg wr spans multiple wqes, then the 2nd fragment looks like this.
- */
-struct t3_pbl_frag {
-	struct fw_riwrh wrh;	/* 0 */
-	__be64 pbl_addrs[14];	/* 1..14 */
-};
-
-#define S_FR_PAGE_COUNT		24
-#define M_FR_PAGE_COUNT		0xff
-#define V_FR_PAGE_COUNT(x)	((x) << S_FR_PAGE_COUNT)
-#define G_FR_PAGE_COUNT(x)	((((x) >> S_FR_PAGE_COUNT)) & M_FR_PAGE_COUNT)
-
-#define S_FR_PAGE_SIZE		16
-#define M_FR_PAGE_SIZE		0x1f
-#define V_FR_PAGE_SIZE(x)	((x) << S_FR_PAGE_SIZE)
-#define G_FR_PAGE_SIZE(x)	((((x) >> S_FR_PAGE_SIZE)) & M_FR_PAGE_SIZE)
-
-#define S_FR_TYPE		8
-#define M_FR_TYPE		0x1
-#define V_FR_TYPE(x)		((x) << S_FR_TYPE)
-#define G_FR_TYPE(x)		((((x) >> S_FR_TYPE)) & M_FR_TYPE)
-
-#define S_FR_PERMS		0
-#define M_FR_PERMS		0xff
-#define V_FR_PERMS(x)		((x) << S_FR_PERMS)
-#define G_FR_PERMS(x)		((((x) >> S_FR_PERMS)) & M_FR_PERMS)
-
-struct t3_local_inv_wr {
-	struct fw_riwrh wrh;	/* 0 */
-	union t3_wrid wrid;	/* 1 */
-	__be32 stag;		/* 2 */
-	__be32 reserved;
-};
-
-struct t3_rdma_write_wr {
-	struct fw_riwrh wrh;	/* 0 */
-	union t3_wrid wrid;	/* 1 */
-	u8 rdmaop;		/* 2 */
-	u8 reserved[3];
-	__be32 stag_sink;
-	__be64 to_sink;		/* 3 */
-	__be32 plen;		/* 4 */
-	__be32 num_sgle;
-	struct t3_sge sgl[T3_MAX_SGE];	/* 5+ */
-};
-
-struct t3_rdma_read_wr {
-	struct fw_riwrh wrh;	/* 0 */
-	union t3_wrid wrid;	/* 1 */
-	u8 rdmaop;		/* 2 */
-	u8 local_inv;
-	u8 reserved[2];
-	__be32 rem_stag;
-	__be64 rem_to;		/* 3 */
-	__be32 local_stag;	/* 4 */
-	__be32 local_len;
-	__be64 local_to;	/* 5 */
-};
-
-struct t3_bind_mw_wr {
-	struct fw_riwrh wrh;	/* 0 */
-	union t3_wrid wrid;	/* 1 */
-	u16 reserved;		/* 2 */
-	u8 type;
-	u8 perms;
-	__be32 mr_stag;
-	__be32 mw_stag;		/* 3 */
-	__be32 mw_len;
-	__be64 mw_va;		/* 4 */
-	__be32 mr_pbl_addr;	/* 5 */
-	u8 reserved2[3];
-	u8 mr_pagesz;
-};
-
-struct t3_receive_wr {
-	struct fw_riwrh wrh;	/* 0 */
-	union t3_wrid wrid;	/* 1 */
-	u8 pagesz[T3_MAX_SGE];
-	__be32 num_sgle;		/* 2 */
-	struct t3_sge sgl[T3_MAX_SGE];	/* 3+ */
-	__be32 pbl_addr[T3_MAX_SGE];
-};
-
-struct t3_bypass_wr {
-	struct fw_riwrh wrh;
-	union t3_wrid wrid;	/* 1 */
-};
-
-struct t3_modify_qp_wr {
-	struct fw_riwrh wrh;	/* 0 */
-	union t3_wrid wrid;	/* 1 */
-	__be32 flags;		/* 2 */
-	__be32 quiesce;		/* 2 */
-	__be32 max_ird;		/* 3 */
-	__be32 max_ord;		/* 3 */
-	__be64 sge_cmd;		/* 4 */
-	__be64 ctx1;		/* 5 */
-	__be64 ctx0;		/* 6 */
-};
-
-enum t3_modify_qp_flags {
-	MODQP_QUIESCE  = 0x01,
-	MODQP_MAX_IRD  = 0x02,
-	MODQP_MAX_ORD  = 0x04,
-	MODQP_WRITE_EC = 0x08,
-	MODQP_READ_EC  = 0x10,
-};
-
-
-enum t3_mpa_attrs {
-	uP_RI_MPA_RX_MARKER_ENABLE = 0x1,
-	uP_RI_MPA_TX_MARKER_ENABLE = 0x2,
-	uP_RI_MPA_CRC_ENABLE = 0x4,
-	uP_RI_MPA_IETF_ENABLE = 0x8
-} __packed;
-
-enum t3_qp_caps {
-	uP_RI_QP_RDMA_READ_ENABLE = 0x01,
-	uP_RI_QP_RDMA_WRITE_ENABLE = 0x02,
-	uP_RI_QP_BIND_ENABLE = 0x04,
-	uP_RI_QP_FAST_REGISTER_ENABLE = 0x08,
-	uP_RI_QP_STAG0_ENABLE = 0x10
-} __packed;
-
-enum rdma_init_rtr_types {
-	RTR_READ = 1,
-	RTR_WRITE = 2,
-	RTR_SEND = 3,
-};
-
-#define S_RTR_TYPE	2
-#define M_RTR_TYPE	0x3
-#define V_RTR_TYPE(x)	((x) << S_RTR_TYPE)
-#define G_RTR_TYPE(x)	((((x) >> S_RTR_TYPE)) & M_RTR_TYPE)
-
-#define S_CHAN		4
-#define M_CHAN		0x3
-#define V_CHAN(x)	((x) << S_CHAN)
-#define G_CHAN(x)	((((x) >> S_CHAN)) & M_CHAN)
-
-struct t3_rdma_init_attr {
-	u32 tid;
-	u32 qpid;
-	u32 pdid;
-	u32 scqid;
-	u32 rcqid;
-	u32 rq_addr;
-	u32 rq_size;
-	enum t3_mpa_attrs mpaattrs;
-	enum t3_qp_caps qpcaps;
-	u16 tcp_emss;
-	u32 ord;
-	u32 ird;
-	u64 qp_dma_addr;
-	u32 qp_dma_size;
-	enum rdma_init_rtr_types rtr_type;
-	u16 flags;
-	u16 rqe_count;
-	u32 irs;
-	u32 chan;
-};
-
-struct t3_rdma_init_wr {
-	struct fw_riwrh wrh;	/* 0 */
-	union t3_wrid wrid;	/* 1 */
-	__be32 qpid;		/* 2 */
-	__be32 pdid;
-	__be32 scqid;		/* 3 */
-	__be32 rcqid;
-	__be32 rq_addr;		/* 4 */
-	__be32 rq_size;
-	u8 mpaattrs;		/* 5 */
-	u8 qpcaps;
-	__be16 ulpdu_size;
-	__be16 flags_rtr_type;
-	__be16 rqe_count;
-	__be32 ord;		/* 6 */
-	__be32 ird;
-	__be64 qp_dma_addr;	/* 7 */
-	__be32 qp_dma_size;	/* 8 */
-	__be32 irs;
-};
-
-struct t3_genbit {
-	u64 flit[15];
-	__be64 genbit;
-};
-
-struct t3_wq_in_err {
-	u64 flit[13];
-	u64 err;
-};
-
-enum rdma_init_wr_flags {
-	MPA_INITIATOR = (1<<0),
-	PRIV_QP = (1<<1),
-};
-
-union t3_wr {
-	struct t3_send_wr send;
-	struct t3_rdma_write_wr write;
-	struct t3_rdma_read_wr read;
-	struct t3_receive_wr recv;
-	struct t3_fastreg_wr fastreg;
-	struct t3_pbl_frag pbl_frag;
-	struct t3_local_inv_wr local_inv;
-	struct t3_bind_mw_wr bind;
-	struct t3_bypass_wr bypass;
-	struct t3_rdma_init_wr init;
-	struct t3_modify_qp_wr qp_mod;
-	struct t3_genbit genbit;
-	struct t3_wq_in_err wq_in_err;
-	__be64 flit[16];
-};
-
-#define T3_SQ_CQE_FLIT	  13
-#define T3_SQ_COOKIE_FLIT 14
-
-#define T3_RQ_COOKIE_FLIT 13
-#define T3_RQ_CQE_FLIT	  14
-
-static inline enum t3_wr_opcode fw_riwrh_opcode(struct fw_riwrh *wqe)
-{
-	return G_FW_RIWR_OP(be32_to_cpu(wqe->op_seop_flags));
-}
-
-enum t3_wr_hdr_bits {
-	T3_EOP = 1,
-	T3_SOP = 2,
-	T3_SOPEOP = T3_EOP|T3_SOP,
-};
-
-static inline void build_fw_riwrh(struct fw_riwrh *wqe, enum t3_wr_opcode op,
-				  enum t3_wr_flags flags, u8 genbit, u32 tid,
-				  u8 len, u8 sopeop)
-{
-	wqe->op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(op) |
-					 V_FW_RIWR_SOPEOP(sopeop) |
-					 V_FW_RIWR_FLAGS(flags));
-	wmb();
-	wqe->gen_tid_len = cpu_to_be32(V_FW_RIWR_GEN(genbit) |
-				       V_FW_RIWR_TID(tid) |
-				       V_FW_RIWR_LEN(len));
-	/* 2nd gen bit... */
-	((union t3_wr *)wqe)->genbit.genbit = cpu_to_be64(genbit);
-}
-
-/*
- * T3 ULP2_TX commands
- */
-enum t3_utx_mem_op {
-	T3_UTX_MEM_READ = 2,
-	T3_UTX_MEM_WRITE = 3
-};
-
-/* T3 MC7 RDMA TPT entry format */
-
-enum tpt_mem_type {
-	TPT_NON_SHARED_MR = 0x0,
-	TPT_SHARED_MR = 0x1,
-	TPT_MW = 0x2,
-	TPT_MW_RELAXED_PROTECTION = 0x3
-};
-
-enum tpt_addr_type {
-	TPT_ZBTO = 0,
-	TPT_VATO = 1
-};
-
-enum tpt_mem_perm {
-	TPT_MW_BIND = 0x10,
-	TPT_LOCAL_READ = 0x8,
-	TPT_LOCAL_WRITE = 0x4,
-	TPT_REMOTE_READ = 0x2,
-	TPT_REMOTE_WRITE = 0x1
-};
-
-struct tpt_entry {
-	__be32 valid_stag_pdid;
-	__be32 flags_pagesize_qpid;
-
-	__be32 rsvd_pbl_addr;
-	__be32 len;
-	__be32 va_hi;
-	__be32 va_low_or_fbo;
-
-	__be32 rsvd_bind_cnt_or_pstag;
-	__be32 rsvd_pbl_size;
-};
-
-#define S_TPT_VALID		31
-#define V_TPT_VALID(x)		((x) << S_TPT_VALID)
-#define F_TPT_VALID		V_TPT_VALID(1U)
-
-#define S_TPT_STAG_KEY		23
-#define M_TPT_STAG_KEY		0xFF
-#define V_TPT_STAG_KEY(x)	((x) << S_TPT_STAG_KEY)
-#define G_TPT_STAG_KEY(x)	(((x) >> S_TPT_STAG_KEY) & M_TPT_STAG_KEY)
-
-#define S_TPT_STAG_STATE	22
-#define V_TPT_STAG_STATE(x)	((x) << S_TPT_STAG_STATE)
-#define F_TPT_STAG_STATE	V_TPT_STAG_STATE(1U)
-
-#define S_TPT_STAG_TYPE		20
-#define M_TPT_STAG_TYPE		0x3
-#define V_TPT_STAG_TYPE(x)	((x) << S_TPT_STAG_TYPE)
-#define G_TPT_STAG_TYPE(x)	(((x) >> S_TPT_STAG_TYPE) & M_TPT_STAG_TYPE)
-
-#define S_TPT_PDID		0
-#define M_TPT_PDID		0xFFFFF
-#define V_TPT_PDID(x)		((x) << S_TPT_PDID)
-#define G_TPT_PDID(x)		(((x) >> S_TPT_PDID) & M_TPT_PDID)
-
-#define S_TPT_PERM		28
-#define M_TPT_PERM		0xF
-#define V_TPT_PERM(x)		((x) << S_TPT_PERM)
-#define G_TPT_PERM(x)		(((x) >> S_TPT_PERM) & M_TPT_PERM)
-
-#define S_TPT_REM_INV_DIS	27
-#define V_TPT_REM_INV_DIS(x)	((x) << S_TPT_REM_INV_DIS)
-#define F_TPT_REM_INV_DIS	V_TPT_REM_INV_DIS(1U)
-
-#define S_TPT_ADDR_TYPE		26
-#define V_TPT_ADDR_TYPE(x)	((x) << S_TPT_ADDR_TYPE)
-#define F_TPT_ADDR_TYPE		V_TPT_ADDR_TYPE(1U)
-
-#define S_TPT_MW_BIND_ENABLE	25
-#define V_TPT_MW_BIND_ENABLE(x)	((x) << S_TPT_MW_BIND_ENABLE)
-#define F_TPT_MW_BIND_ENABLE    V_TPT_MW_BIND_ENABLE(1U)
-
-#define S_TPT_PAGE_SIZE		20
-#define M_TPT_PAGE_SIZE		0x1F
-#define V_TPT_PAGE_SIZE(x)	((x) << S_TPT_PAGE_SIZE)
-#define G_TPT_PAGE_SIZE(x)	(((x) >> S_TPT_PAGE_SIZE) & M_TPT_PAGE_SIZE)
-
-#define S_TPT_PBL_ADDR		0
-#define M_TPT_PBL_ADDR		0x1FFFFFFF
-#define V_TPT_PBL_ADDR(x)	((x) << S_TPT_PBL_ADDR)
-#define G_TPT_PBL_ADDR(x)       (((x) >> S_TPT_PBL_ADDR) & M_TPT_PBL_ADDR)
-
-#define S_TPT_QPID		0
-#define M_TPT_QPID		0xFFFFF
-#define V_TPT_QPID(x)		((x) << S_TPT_QPID)
-#define G_TPT_QPID(x)		(((x) >> S_TPT_QPID) & M_TPT_QPID)
-
-#define S_TPT_PSTAG		0
-#define M_TPT_PSTAG		0xFFFFFF
-#define V_TPT_PSTAG(x)		((x) << S_TPT_PSTAG)
-#define G_TPT_PSTAG(x)		(((x) >> S_TPT_PSTAG) & M_TPT_PSTAG)
-
-#define S_TPT_PBL_SIZE		0
-#define M_TPT_PBL_SIZE		0xFFFFF
-#define V_TPT_PBL_SIZE(x)	((x) << S_TPT_PBL_SIZE)
-#define G_TPT_PBL_SIZE(x)	(((x) >> S_TPT_PBL_SIZE) & M_TPT_PBL_SIZE)
-
-/*
- * CQE defs
- */
-struct t3_cqe {
-	__be32 header;
-	__be32 len;
-	union {
-		struct {
-			__be32 stag;
-			__be32 msn;
-		} rcqe;
-		struct {
-			u32 wrid_hi;
-			u32 wrid_low;
-		} scqe;
-	} u;
-};
-
-#define S_CQE_OOO	  31
-#define M_CQE_OOO	  0x1
-#define G_CQE_OOO(x)	  ((((x) >> S_CQE_OOO)) & M_CQE_OOO)
-#define V_CEQ_OOO(x)	  ((x)<<S_CQE_OOO)
-
-#define S_CQE_QPID        12
-#define M_CQE_QPID        0x7FFFF
-#define G_CQE_QPID(x)     ((((x) >> S_CQE_QPID)) & M_CQE_QPID)
-#define V_CQE_QPID(x)	  ((x)<<S_CQE_QPID)
-
-#define S_CQE_SWCQE       11
-#define M_CQE_SWCQE       0x1
-#define G_CQE_SWCQE(x)    ((((x) >> S_CQE_SWCQE)) & M_CQE_SWCQE)
-#define V_CQE_SWCQE(x)	  ((x)<<S_CQE_SWCQE)
-
-#define S_CQE_GENBIT      10
-#define M_CQE_GENBIT      0x1
-#define G_CQE_GENBIT(x)   (((x) >> S_CQE_GENBIT) & M_CQE_GENBIT)
-#define V_CQE_GENBIT(x)	  ((x)<<S_CQE_GENBIT)
-
-#define S_CQE_STATUS      5
-#define M_CQE_STATUS      0x1F
-#define G_CQE_STATUS(x)   ((((x) >> S_CQE_STATUS)) & M_CQE_STATUS)
-#define V_CQE_STATUS(x)   ((x)<<S_CQE_STATUS)
-
-#define S_CQE_TYPE        4
-#define M_CQE_TYPE        0x1
-#define G_CQE_TYPE(x)     ((((x) >> S_CQE_TYPE)) & M_CQE_TYPE)
-#define V_CQE_TYPE(x)     ((x)<<S_CQE_TYPE)
-
-#define S_CQE_OPCODE      0
-#define M_CQE_OPCODE      0xF
-#define G_CQE_OPCODE(x)   ((((x) >> S_CQE_OPCODE)) & M_CQE_OPCODE)
-#define V_CQE_OPCODE(x)   ((x)<<S_CQE_OPCODE)
-
-#define SW_CQE(x)         (G_CQE_SWCQE(be32_to_cpu((x).header)))
-#define CQE_OOO(x)        (G_CQE_OOO(be32_to_cpu((x).header)))
-#define CQE_QPID(x)       (G_CQE_QPID(be32_to_cpu((x).header)))
-#define CQE_GENBIT(x)     (G_CQE_GENBIT(be32_to_cpu((x).header)))
-#define CQE_TYPE(x)       (G_CQE_TYPE(be32_to_cpu((x).header)))
-#define SQ_TYPE(x)	  (CQE_TYPE((x)))
-#define RQ_TYPE(x)	  (!CQE_TYPE((x)))
-#define CQE_STATUS(x)     (G_CQE_STATUS(be32_to_cpu((x).header)))
-#define CQE_OPCODE(x)     (G_CQE_OPCODE(be32_to_cpu((x).header)))
-
-#define CQE_SEND_OPCODE(x)( \
-	(G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND) || \
-	(G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_SE) || \
-	(G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_INV) || \
-	(G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_SE_INV))
-
-#define CQE_LEN(x)        (be32_to_cpu((x).len))
-
-/* used for RQ completion processing */
-#define CQE_WRID_STAG(x)  (be32_to_cpu((x).u.rcqe.stag))
-#define CQE_WRID_MSN(x)   (be32_to_cpu((x).u.rcqe.msn))
-
-/* used for SQ completion processing */
-#define CQE_WRID_SQ_WPTR(x)	((x).u.scqe.wrid_hi)
-#define CQE_WRID_WPTR(x)	((x).u.scqe.wrid_low)
-
-/* generic accessor macros */
-#define CQE_WRID_HI(x)		((x).u.scqe.wrid_hi)
-#define CQE_WRID_LOW(x)		((x).u.scqe.wrid_low)
-
-#define TPT_ERR_SUCCESS                     0x0
-#define TPT_ERR_STAG                        0x1	 /* STAG invalid: either the */
-						 /* STAG is off limit, being 0, */
-						 /* or STAG_key mismatch */
-#define TPT_ERR_PDID                        0x2	 /* PDID mismatch */
-#define TPT_ERR_QPID                        0x3	 /* QPID mismatch */
-#define TPT_ERR_ACCESS                      0x4	 /* Invalid access right */
-#define TPT_ERR_WRAP                        0x5	 /* Wrap error */
-#define TPT_ERR_BOUND                       0x6	 /* base and bounds violation */
-#define TPT_ERR_INVALIDATE_SHARED_MR        0x7	 /* attempt to invalidate a  */
-						 /* shared memory region */
-#define TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND 0x8	 /* attempt to invalidate an */
-						 /* MR with a bound MW */
-#define TPT_ERR_ECC                         0x9	 /* ECC error detected */
-#define TPT_ERR_ECC_PSTAG                   0xA	 /* ECC error detected when  */
-						 /* reading PSTAG for a MW  */
-						 /* Invalidate */
-#define TPT_ERR_PBL_ADDR_BOUND              0xB	 /* pbl addr out of bounds:  */
-						 /* software error */
-#define TPT_ERR_SWFLUSH			    0xC	 /* SW FLUSHED */
-#define TPT_ERR_CRC                         0x10 /* CRC error */
-#define TPT_ERR_MARKER                      0x11 /* Marker error */
-#define TPT_ERR_PDU_LEN_ERR                 0x12 /* invalid PDU length */
-#define TPT_ERR_OUT_OF_RQE                  0x13 /* out of RQE */
-#define TPT_ERR_DDP_VERSION                 0x14 /* wrong DDP version */
-#define TPT_ERR_RDMA_VERSION                0x15 /* wrong RDMA version */
-#define TPT_ERR_OPCODE                      0x16 /* invalid rdma opcode */
-#define TPT_ERR_DDP_QUEUE_NUM               0x17 /* invalid ddp queue number */
-#define TPT_ERR_MSN                         0x18 /* MSN error */
-#define TPT_ERR_TBIT                        0x19 /* tag bit not set correctly */
-#define TPT_ERR_MO                          0x1A /* MO not 0 for TERMINATE  */
-						 /* or READ_REQ */
-#define TPT_ERR_MSN_GAP                     0x1B
-#define TPT_ERR_MSN_RANGE                   0x1C
-#define TPT_ERR_IRD_OVERFLOW                0x1D
-#define TPT_ERR_RQE_ADDR_BOUND              0x1E /* RQE addr out of bounds:  */
-						 /* software error */
-#define TPT_ERR_INTERNAL_ERR                0x1F /* internal error (opcode  */
-						 /* mismatch) */
-
-struct t3_swsq {
-	__u64			wr_id;
-	struct t3_cqe		cqe;
-	__u32			sq_wptr;
-	__be32			read_len;
-	int			opcode;
-	int			complete;
-	int			signaled;
-};
-
-struct t3_swrq {
-	__u64			wr_id;
-	__u32			pbl_addr;
-};
-
-/*
- * A T3 WQ implements both the SQ and RQ.
- */
-struct t3_wq {
-	union t3_wr *queue;		/* DMA accessible memory */
-	dma_addr_t dma_addr;		/* DMA address for HW */
-	DEFINE_DMA_UNMAP_ADDR(mapping); /* unmap cruft */
-	u32 error;			/* 1 once we go to ERROR */
-	u32 qpid;
-	u32 wptr;			/* idx to next available WR slot */
-	u32 size_log2;			/* total wq size */
-	struct t3_swsq *sq;		/* SW SQ */
-	struct t3_swsq *oldest_read;	/* tracks oldest pending read */
-	u32 sq_wptr;			/* sq_wptr - sq_rptr == count of */
-	u32 sq_rptr;			/* pending wrs */
-	u32 sq_size_log2;		/* sq size */
-	struct t3_swrq *rq;		/* SW RQ (holds consumer wr_ids) */
-	u32 rq_wptr;			/* rq_wptr - rq_rptr == count of */
-	u32 rq_rptr;			/* pending wrs */
-	struct t3_swrq *rq_oldest_wr;	/* oldest wr on the SW RQ */
-	u32 rq_size_log2;		/* rq size */
-	u32 rq_addr;			/* rq adapter address */
-	void __iomem *doorbell;		/* kernel db */
-	u64 udb;			/* user db if any */
-	struct cxio_rdev *rdev;
-};
-
-struct t3_cq {
-	u32 cqid;
-	u32 rptr;
-	u32 wptr;
-	u32 size_log2;
-	dma_addr_t dma_addr;
-	DEFINE_DMA_UNMAP_ADDR(mapping);
-	struct t3_cqe *queue;
-	struct t3_cqe *sw_queue;
-	u32 sw_rptr;
-	u32 sw_wptr;
-};
-
-#define CQ_VLD_ENTRY(ptr,size_log2,cqe) (Q_GENBIT(ptr,size_log2) == \
-					 CQE_GENBIT(*cqe))
-
-struct t3_cq_status_page {
-	u32 cq_err;
-};
-
-static inline int cxio_cq_in_error(struct t3_cq *cq)
-{
-	return ((struct t3_cq_status_page *)
-		&cq->queue[1 << cq->size_log2])->cq_err;
-}
-
-static inline void cxio_set_cq_in_error(struct t3_cq *cq)
-{
-	((struct t3_cq_status_page *)
-	 &cq->queue[1 << cq->size_log2])->cq_err = 1;
-}
-
-static inline void cxio_set_wq_in_error(struct t3_wq *wq)
-{
-	wq->queue->wq_in_err.err |= 1;
-}
-
-static inline void cxio_disable_wq_db(struct t3_wq *wq)
-{
-	wq->queue->wq_in_err.err |= 2;
-}
-
-static inline void cxio_enable_wq_db(struct t3_wq *wq)
-{
-	wq->queue->wq_in_err.err &= ~2;
-}
-
-static inline int cxio_wq_db_enabled(struct t3_wq *wq)
-{
-	return !(wq->queue->wq_in_err.err & 2);
-}
-
-static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq)
-{
-	struct t3_cqe *cqe;
-
-	cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2));
-	if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe))
-		return cqe;
-	return NULL;
-}
-
-static inline struct t3_cqe *cxio_next_sw_cqe(struct t3_cq *cq)
-{
-	struct t3_cqe *cqe;
-
-	if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) {
-		cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2));
-		return cqe;
-	}
-	return NULL;
-}
-
-static inline struct t3_cqe *cxio_next_cqe(struct t3_cq *cq)
-{
-	struct t3_cqe *cqe;
-
-	if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) {
-		cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2));
-		return cqe;
-	}
-	cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2));
-	if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe))
-		return cqe;
-	return NULL;
-}
-
-#endif
diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c
deleted file mode 100644
index 56a8ab6..0000000
--- a/drivers/infiniband/hw/cxgb3/iwch.c
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-
-#include <rdma/ib_verbs.h>
-
-#include "cxgb3_offload.h"
-#include "iwch_provider.h"
-#include <rdma/cxgb3-abi.h>
-#include "iwch.h"
-#include "iwch_cm.h"
-
-#define DRV_VERSION "1.1"
-
-MODULE_AUTHOR("Boyd Faulkner, Steve Wise");
-MODULE_DESCRIPTION("Chelsio T3 RDMA Driver");
-MODULE_LICENSE("Dual BSD/GPL");
-
-static void open_rnic_dev(struct t3cdev *);
-static void close_rnic_dev(struct t3cdev *);
-static void iwch_event_handler(struct t3cdev *, u32, u32);
-
-struct cxgb3_client t3c_client = {
-	.name = "iw_cxgb3",
-	.add = open_rnic_dev,
-	.remove = close_rnic_dev,
-	.handlers = t3c_handlers,
-	.redirect = iwch_ep_redirect,
-	.event_handler = iwch_event_handler
-};
-
-static LIST_HEAD(dev_list);
-static DEFINE_MUTEX(dev_mutex);
-
-static void disable_dbs(struct iwch_dev *rnicp)
-{
-	unsigned long index;
-	struct iwch_qp *qhp;
-
-	xa_lock_irq(&rnicp->qps);
-	xa_for_each(&rnicp->qps, index, qhp)
-		cxio_disable_wq_db(&qhp->wq);
-	xa_unlock_irq(&rnicp->qps);
-}
-
-static void enable_dbs(struct iwch_dev *rnicp, int ring_db)
-{
-	unsigned long index;
-	struct iwch_qp *qhp;
-
-	xa_lock_irq(&rnicp->qps);
-	xa_for_each(&rnicp->qps, index, qhp) {
-		if (ring_db)
-			ring_doorbell(qhp->rhp->rdev.ctrl_qp.doorbell,
-					qhp->wq.qpid);
-		cxio_enable_wq_db(&qhp->wq);
-	}
-	xa_unlock_irq(&rnicp->qps);
-}
-
-static void iwch_db_drop_task(struct work_struct *work)
-{
-	struct iwch_dev *rnicp = container_of(work, struct iwch_dev,
-					      db_drop_task.work);
-	enable_dbs(rnicp, 1);
-}
-
-static void rnic_init(struct iwch_dev *rnicp)
-{
-	pr_debug("%s iwch_dev %p\n", __func__,  rnicp);
-	xa_init_flags(&rnicp->cqs, XA_FLAGS_LOCK_IRQ);
-	xa_init_flags(&rnicp->qps, XA_FLAGS_LOCK_IRQ);
-	xa_init_flags(&rnicp->mrs, XA_FLAGS_LOCK_IRQ);
-	INIT_DELAYED_WORK(&rnicp->db_drop_task, iwch_db_drop_task);
-
-	rnicp->attr.max_qps = T3_MAX_NUM_QP - 32;
-	rnicp->attr.max_wrs = T3_MAX_QP_DEPTH;
-	rnicp->attr.max_sge_per_wr = T3_MAX_SGE;
-	rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE;
-	rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1;
-	rnicp->attr.max_cqes_per_cq = T3_MAX_CQ_DEPTH;
-	rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev);
-	rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE;
-	rnicp->attr.max_pds = T3_MAX_NUM_PD - 1;
-	rnicp->attr.mem_pgsizes_bitmask = T3_PAGESIZE_MASK;
-	rnicp->attr.max_mr_size = T3_MAX_MR_SIZE;
-	rnicp->attr.can_resize_wq = 0;
-	rnicp->attr.max_rdma_reads_per_qp = 8;
-	rnicp->attr.max_rdma_read_resources =
-	    rnicp->attr.max_rdma_reads_per_qp * rnicp->attr.max_qps;
-	rnicp->attr.max_rdma_read_qp_depth = 8;	/* IRD */
-	rnicp->attr.max_rdma_read_depth =
-	    rnicp->attr.max_rdma_read_qp_depth * rnicp->attr.max_qps;
-	rnicp->attr.rq_overflow_handled = 0;
-	rnicp->attr.can_modify_ird = 0;
-	rnicp->attr.can_modify_ord = 0;
-	rnicp->attr.max_mem_windows = rnicp->attr.max_mem_regs - 1;
-	rnicp->attr.stag0_value = 1;
-	rnicp->attr.zbva_support = 1;
-	rnicp->attr.local_invalidate_fence = 1;
-	rnicp->attr.cq_overflow_detection = 1;
-	return;
-}
-
-static void open_rnic_dev(struct t3cdev *tdev)
-{
-	struct iwch_dev *rnicp;
-
-	pr_debug("%s t3cdev %p\n", __func__,  tdev);
-	pr_info_once("Chelsio T3 RDMA Driver - version %s\n", DRV_VERSION);
-	rnicp = ib_alloc_device(iwch_dev, ibdev);
-	if (!rnicp) {
-		pr_err("Cannot allocate ib device\n");
-		return;
-	}
-	rnicp->rdev.ulp = rnicp;
-	rnicp->rdev.t3cdev_p = tdev;
-
-	mutex_lock(&dev_mutex);
-
-	if (cxio_rdev_open(&rnicp->rdev)) {
-		mutex_unlock(&dev_mutex);
-		pr_err("Unable to open CXIO rdev\n");
-		ib_dealloc_device(&rnicp->ibdev);
-		return;
-	}
-
-	rnic_init(rnicp);
-
-	list_add_tail(&rnicp->entry, &dev_list);
-	mutex_unlock(&dev_mutex);
-
-	if (iwch_register_device(rnicp)) {
-		pr_err("Unable to register device\n");
-		close_rnic_dev(tdev);
-	}
-	pr_info("Initialized device %s\n",
-		pci_name(rnicp->rdev.rnic_info.pdev));
-	return;
-}
-
-static void close_rnic_dev(struct t3cdev *tdev)
-{
-	struct iwch_dev *dev, *tmp;
-	pr_debug("%s t3cdev %p\n", __func__,  tdev);
-	mutex_lock(&dev_mutex);
-	list_for_each_entry_safe(dev, tmp, &dev_list, entry) {
-		if (dev->rdev.t3cdev_p == tdev) {
-			dev->rdev.flags = CXIO_ERROR_FATAL;
-			synchronize_net();
-			cancel_delayed_work_sync(&dev->db_drop_task);
-			list_del(&dev->entry);
-			iwch_unregister_device(dev);
-			cxio_rdev_close(&dev->rdev);
-			WARN_ON(!xa_empty(&dev->cqs));
-			WARN_ON(!xa_empty(&dev->qps));
-			WARN_ON(!xa_empty(&dev->mrs));
-			ib_dealloc_device(&dev->ibdev);
-			break;
-		}
-	}
-	mutex_unlock(&dev_mutex);
-}
-
-static void iwch_event_handler(struct t3cdev *tdev, u32 evt, u32 port_id)
-{
-	struct cxio_rdev *rdev = tdev->ulp;
-	struct iwch_dev *rnicp;
-	struct ib_event event;
-	u32 portnum = port_id + 1;
-	int dispatch = 0;
-
-	if (!rdev)
-		return;
-	rnicp = rdev_to_iwch_dev(rdev);
-	switch (evt) {
-	case OFFLOAD_STATUS_DOWN: {
-		rdev->flags = CXIO_ERROR_FATAL;
-		synchronize_net();
-		event.event  = IB_EVENT_DEVICE_FATAL;
-		dispatch = 1;
-		break;
-		}
-	case OFFLOAD_PORT_DOWN: {
-		event.event  = IB_EVENT_PORT_ERR;
-		dispatch = 1;
-		break;
-		}
-	case OFFLOAD_PORT_UP: {
-		event.event  = IB_EVENT_PORT_ACTIVE;
-		dispatch = 1;
-		break;
-		}
-	case OFFLOAD_DB_FULL: {
-		disable_dbs(rnicp);
-		break;
-		}
-	case OFFLOAD_DB_EMPTY: {
-		enable_dbs(rnicp, 1);
-		break;
-		}
-	case OFFLOAD_DB_DROP: {
-		unsigned long delay = 1000;
-		unsigned short r;
-
-		disable_dbs(rnicp);
-		get_random_bytes(&r, 2);
-		delay += r & 1023;
-
-		/*
-		 * delay is between 1000-2023 usecs.
-		 */
-		schedule_delayed_work(&rnicp->db_drop_task,
-			usecs_to_jiffies(delay));
-		break;
-		}
-	}
-
-	if (dispatch) {
-		event.device = &rnicp->ibdev;
-		event.element.port_num = portnum;
-		ib_dispatch_event(&event);
-	}
-
-	return;
-}
-
-static int __init iwch_init_module(void)
-{
-	int err;
-
-	err = cxio_hal_init();
-	if (err)
-		return err;
-	err = iwch_cm_init();
-	if (err)
-		return err;
-	cxio_register_ev_cb(iwch_ev_dispatch);
-	cxgb3_register_client(&t3c_client);
-	return 0;
-}
-
-static void __exit iwch_exit_module(void)
-{
-	cxgb3_unregister_client(&t3c_client);
-	cxio_unregister_ev_cb(iwch_ev_dispatch);
-	iwch_cm_term();
-	cxio_hal_exit();
-}
-
-module_init(iwch_init_module);
-module_exit(iwch_exit_module);
diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h
deleted file mode 100644
index 310a937..0000000
--- a/drivers/infiniband/hw/cxgb3/iwch.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __IWCH_H__
-#define __IWCH_H__
-
-#include <linux/mutex.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <linux/xarray.h>
-#include <linux/workqueue.h>
-
-#include <rdma/ib_verbs.h>
-
-#include "cxio_hal.h"
-#include "cxgb3_offload.h"
-
-struct iwch_pd;
-struct iwch_cq;
-struct iwch_qp;
-struct iwch_mr;
-
-struct iwch_rnic_attributes {
-	u32 max_qps;
-	u32 max_wrs;				/* Max for any SQ/RQ */
-	u32 max_sge_per_wr;
-	u32 max_sge_per_rdma_write_wr;	/* for RDMA Write WR */
-	u32 max_cqs;
-	u32 max_cqes_per_cq;
-	u32 max_mem_regs;
-	u32 max_phys_buf_entries;		/* for phys buf list */
-	u32 max_pds;
-
-	/*
-	 * The memory page sizes supported by this RNIC.
-	 * Bit position i in bitmap indicates page of
-	 * size (4k)^i.  Phys block list mode unsupported.
-	 */
-	u32 mem_pgsizes_bitmask;
-	u64 max_mr_size;
-	u8 can_resize_wq;
-
-	/*
-	 * The maximum number of RDMA Reads that can be outstanding
-	 * per QP with this RNIC as the target.
-	 */
-	u32 max_rdma_reads_per_qp;
-
-	/*
-	 * The maximum number of resources used for RDMA Reads
-	 * by this RNIC with this RNIC as the target.
-	 */
-	u32 max_rdma_read_resources;
-
-	/*
-	 * The max depth per QP for initiation of RDMA Read
-	 * by this RNIC.
-	 */
-	u32 max_rdma_read_qp_depth;
-
-	/*
-	 * The maximum depth for initiation of RDMA Read
-	 * operations by this RNIC on all QPs
-	 */
-	u32 max_rdma_read_depth;
-	u8 rq_overflow_handled;
-	u32 can_modify_ird;
-	u32 can_modify_ord;
-	u32 max_mem_windows;
-	u32 stag0_value;
-	u8 zbva_support;
-	u8 local_invalidate_fence;
-	u32 cq_overflow_detection;
-};
-
-struct iwch_dev {
-	struct ib_device ibdev;
-	struct cxio_rdev rdev;
-	u32 device_cap_flags;
-	struct iwch_rnic_attributes attr;
-	struct xarray cqs;
-	struct xarray qps;
-	struct xarray mrs;
-	struct list_head entry;
-	struct delayed_work db_drop_task;
-};
-
-static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev)
-{
-	return container_of(ibdev, struct iwch_dev, ibdev);
-}
-
-static inline struct iwch_dev *rdev_to_iwch_dev(struct cxio_rdev *rdev)
-{
-	return container_of(rdev, struct iwch_dev, rdev);
-}
-
-static inline int t3b_device(const struct iwch_dev *rhp)
-{
-	return rhp->rdev.t3cdev_p->type == T3B;
-}
-
-static inline int t3a_device(const struct iwch_dev *rhp)
-{
-	return rhp->rdev.t3cdev_p->type == T3A;
-}
-
-static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid)
-{
-	return xa_load(&rhp->cqs, cqid);
-}
-
-static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid)
-{
-	return xa_load(&rhp->qps, qpid);
-}
-
-static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid)
-{
-	return xa_load(&rhp->mrs, mmid);
-}
-
-extern struct cxgb3_client t3c_client;
-extern cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS];
-extern void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb);
-
-#endif
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
deleted file mode 100644
index 0bca72c..0000000
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ /dev/null
@@ -1,2258 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/module.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-#include <linux/workqueue.h>
-#include <linux/skbuff.h>
-#include <linux/timer.h>
-#include <linux/notifier.h>
-#include <linux/inetdevice.h>
-
-#include <net/neighbour.h>
-#include <net/netevent.h>
-#include <net/route.h>
-
-#include "tcb.h"
-#include "cxgb3_offload.h"
-#include "iwch.h"
-#include "iwch_provider.h"
-#include "iwch_cm.h"
-
-static char *states[] = {
-	"idle",
-	"listen",
-	"connecting",
-	"mpa_wait_req",
-	"mpa_req_sent",
-	"mpa_req_rcvd",
-	"mpa_rep_sent",
-	"fpdu_mode",
-	"aborting",
-	"closing",
-	"moribund",
-	"dead",
-	NULL,
-};
-
-int peer2peer = 0;
-module_param(peer2peer, int, 0644);
-MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)");
-
-static int ep_timeout_secs = 60;
-module_param(ep_timeout_secs, int, 0644);
-MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
-				   "in seconds (default=60)");
-
-static int mpa_rev = 1;
-module_param(mpa_rev, int, 0644);
-MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, "
-		 "1 is spec compliant. (default=1)");
-
-static int markers_enabled = 0;
-module_param(markers_enabled, int, 0644);
-MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)");
-
-static int crc_enabled = 1;
-module_param(crc_enabled, int, 0644);
-MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)");
-
-static int rcv_win = 256 * 1024;
-module_param(rcv_win, int, 0644);
-MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256KB)");
-
-static int snd_win = 32 * 1024;
-module_param(snd_win, int, 0644);
-MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)");
-
-static unsigned int nocong = 0;
-module_param(nocong, uint, 0644);
-MODULE_PARM_DESC(nocong, "Turn off congestion control (default=0)");
-
-static unsigned int cong_flavor = 1;
-module_param(cong_flavor, uint, 0644);
-MODULE_PARM_DESC(cong_flavor, "TCP Congestion control flavor (default=1)");
-
-static struct workqueue_struct *workq;
-
-static struct sk_buff_head rxq;
-
-static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp);
-static void ep_timeout(struct timer_list *t);
-static void connect_reply_upcall(struct iwch_ep *ep, int status);
-
-static void start_ep_timer(struct iwch_ep *ep)
-{
-	pr_debug("%s ep %p\n", __func__, ep);
-	if (timer_pending(&ep->timer)) {
-		pr_debug("%s stopped / restarted timer ep %p\n", __func__, ep);
-		del_timer_sync(&ep->timer);
-	} else
-		get_ep(&ep->com);
-	ep->timer.expires = jiffies + ep_timeout_secs * HZ;
-	add_timer(&ep->timer);
-}
-
-static void stop_ep_timer(struct iwch_ep *ep)
-{
-	pr_debug("%s ep %p\n", __func__, ep);
-	if (!timer_pending(&ep->timer)) {
-		WARN(1, "%s timer stopped when its not running!  ep %p state %u\n",
-			__func__, ep, ep->com.state);
-		return;
-	}
-	del_timer_sync(&ep->timer);
-	put_ep(&ep->com);
-}
-
-static int iwch_l2t_send(struct t3cdev *tdev, struct sk_buff *skb, struct l2t_entry *l2e)
-{
-	int	error = 0;
-	struct cxio_rdev *rdev;
-
-	rdev = (struct cxio_rdev *)tdev->ulp;
-	if (cxio_fatal_error(rdev)) {
-		kfree_skb(skb);
-		return -EIO;
-	}
-	error = l2t_send(tdev, skb, l2e);
-	if (error < 0)
-		kfree_skb(skb);
-	return error < 0 ? error : 0;
-}
-
-int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb)
-{
-	int	error = 0;
-	struct cxio_rdev *rdev;
-
-	rdev = (struct cxio_rdev *)tdev->ulp;
-	if (cxio_fatal_error(rdev)) {
-		kfree_skb(skb);
-		return -EIO;
-	}
-	error = cxgb3_ofld_send(tdev, skb);
-	if (error < 0)
-		kfree_skb(skb);
-	return error < 0 ? error : 0;
-}
-
-static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb)
-{
-	struct cpl_tid_release *req;
-
-	skb = get_skb(skb, sizeof(*req), GFP_KERNEL);
-	if (!skb)
-		return;
-	req = skb_put(skb, sizeof(*req));
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid));
-	skb->priority = CPL_PRIORITY_SETUP;
-	iwch_cxgb3_ofld_send(tdev, skb);
-	return;
-}
-
-int iwch_quiesce_tid(struct iwch_ep *ep)
-{
-	struct cpl_set_tcb_field *req;
-	struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
-
-	if (!skb)
-		return -ENOMEM;
-	req = skb_put(skb, sizeof(*req));
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid));
-	req->reply = 0;
-	req->cpu_idx = 0;
-	req->word = htons(W_TCB_RX_QUIESCE);
-	req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);
-	req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE);
-
-	skb->priority = CPL_PRIORITY_DATA;
-	return iwch_cxgb3_ofld_send(ep->com.tdev, skb);
-}
-
-int iwch_resume_tid(struct iwch_ep *ep)
-{
-	struct cpl_set_tcb_field *req;
-	struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
-
-	if (!skb)
-		return -ENOMEM;
-	req = skb_put(skb, sizeof(*req));
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid));
-	req->reply = 0;
-	req->cpu_idx = 0;
-	req->word = htons(W_TCB_RX_QUIESCE);
-	req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);
-	req->val = 0;
-
-	skb->priority = CPL_PRIORITY_DATA;
-	return iwch_cxgb3_ofld_send(ep->com.tdev, skb);
-}
-
-static void set_emss(struct iwch_ep *ep, u16 opt)
-{
-	pr_debug("%s ep %p opt %u\n", __func__, ep, opt);
-	ep->emss = T3C_DATA(ep->com.tdev)->mtus[G_TCPOPT_MSS(opt)] - 40;
-	if (G_TCPOPT_TSTAMP(opt))
-		ep->emss -= 12;
-	if (ep->emss < 128)
-		ep->emss = 128;
-	pr_debug("emss=%d\n", ep->emss);
-}
-
-static enum iwch_ep_state state_read(struct iwch_ep_common *epc)
-{
-	unsigned long flags;
-	enum iwch_ep_state state;
-
-	spin_lock_irqsave(&epc->lock, flags);
-	state = epc->state;
-	spin_unlock_irqrestore(&epc->lock, flags);
-	return state;
-}
-
-static void __state_set(struct iwch_ep_common *epc, enum iwch_ep_state new)
-{
-	epc->state = new;
-}
-
-static void state_set(struct iwch_ep_common *epc, enum iwch_ep_state new)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&epc->lock, flags);
-	pr_debug("%s - %s -> %s\n", __func__, states[epc->state], states[new]);
-	__state_set(epc, new);
-	spin_unlock_irqrestore(&epc->lock, flags);
-	return;
-}
-
-static void *alloc_ep(int size, gfp_t gfp)
-{
-	struct iwch_ep_common *epc;
-
-	epc = kzalloc(size, gfp);
-	if (epc) {
-		kref_init(&epc->kref);
-		spin_lock_init(&epc->lock);
-		init_waitqueue_head(&epc->waitq);
-	}
-	pr_debug("%s alloc ep %p\n", __func__, epc);
-	return epc;
-}
-
-void __free_ep(struct kref *kref)
-{
-	struct iwch_ep *ep;
-	ep = container_of(container_of(kref, struct iwch_ep_common, kref),
-			  struct iwch_ep, com);
-	pr_debug("%s ep %p state %s\n",
-		 __func__, ep, states[state_read(&ep->com)]);
-	if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) {
-		cxgb3_remove_tid(ep->com.tdev, (void *)ep, ep->hwtid);
-		dst_release(ep->dst);
-		l2t_release(ep->com.tdev, ep->l2t);
-	}
-	kfree(ep);
-}
-
-static void release_ep_resources(struct iwch_ep *ep)
-{
-	pr_debug("%s ep %p tid %d\n", __func__, ep, ep->hwtid);
-	set_bit(RELEASE_RESOURCES, &ep->com.flags);
-	put_ep(&ep->com);
-}
-
-static int status2errno(int status)
-{
-	switch (status) {
-	case CPL_ERR_NONE:
-		return 0;
-	case CPL_ERR_CONN_RESET:
-		return -ECONNRESET;
-	case CPL_ERR_ARP_MISS:
-		return -EHOSTUNREACH;
-	case CPL_ERR_CONN_TIMEDOUT:
-		return -ETIMEDOUT;
-	case CPL_ERR_TCAM_FULL:
-		return -ENOMEM;
-	case CPL_ERR_CONN_EXIST:
-		return -EADDRINUSE;
-	default:
-		return -EIO;
-	}
-}
-
-/*
- * Try and reuse skbs already allocated...
- */
-static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp)
-{
-	if (skb && !skb_is_nonlinear(skb) && !skb_cloned(skb)) {
-		skb_trim(skb, 0);
-		skb_get(skb);
-	} else {
-		skb = alloc_skb(len, gfp);
-	}
-	return skb;
-}
-
-static struct rtable *find_route(struct t3cdev *dev, __be32 local_ip,
-				 __be32 peer_ip, __be16 local_port,
-				 __be16 peer_port, u8 tos)
-{
-	struct rtable *rt;
-	struct flowi4 fl4;
-
-	rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip,
-				   peer_port, local_port, IPPROTO_TCP,
-				   tos, 0);
-	if (IS_ERR(rt))
-		return NULL;
-	return rt;
-}
-
-static unsigned int find_best_mtu(const struct t3c_data *d, unsigned short mtu)
-{
-	int i = 0;
-
-	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
-		++i;
-	return i;
-}
-
-static void arp_failure_discard(struct t3cdev *dev, struct sk_buff *skb)
-{
-	pr_debug("%s t3cdev %p\n", __func__, dev);
-	kfree_skb(skb);
-}
-
-/*
- * Handle an ARP failure for an active open.
- */
-static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb)
-{
-	pr_err("ARP failure during connect\n");
-	kfree_skb(skb);
-}
-
-/*
- * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
- * and send it along.
- */
-static void abort_arp_failure(struct t3cdev *dev, struct sk_buff *skb)
-{
-	struct cpl_abort_req *req = cplhdr(skb);
-
-	pr_debug("%s t3cdev %p\n", __func__, dev);
-	req->cmd = CPL_ABORT_NO_RST;
-	iwch_cxgb3_ofld_send(dev, skb);
-}
-
-static int send_halfclose(struct iwch_ep *ep, gfp_t gfp)
-{
-	struct cpl_close_con_req *req;
-	struct sk_buff *skb;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	skb = get_skb(NULL, sizeof(*req), gfp);
-	if (!skb) {
-		pr_err("%s - failed to alloc skb\n", __func__);
-		return -ENOMEM;
-	}
-	skb->priority = CPL_PRIORITY_DATA;
-	set_arp_failure_handler(skb, arp_failure_discard);
-	req = skb_put(skb, sizeof(*req));
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
-	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, ep->hwtid));
-	return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
-}
-
-static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp)
-{
-	struct cpl_abort_req *req;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	skb = get_skb(skb, sizeof(*req), gfp);
-	if (!skb) {
-		pr_err("%s - failed to alloc skb\n", __func__);
-		return -ENOMEM;
-	}
-	skb->priority = CPL_PRIORITY_DATA;
-	set_arp_failure_handler(skb, abort_arp_failure);
-	req = skb_put_zero(skb, sizeof(*req));
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
-	req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid));
-	req->cmd = CPL_ABORT_SEND_RST;
-	return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
-}
-
-static int send_connect(struct iwch_ep *ep)
-{
-	struct cpl_act_open_req *req;
-	struct sk_buff *skb;
-	u32 opt0h, opt0l, opt2;
-	unsigned int mtu_idx;
-	int wscale;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-
-	skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
-	if (!skb) {
-		pr_err("%s - failed to alloc skb\n", __func__);
-		return -ENOMEM;
-	}
-	mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst));
-	wscale = compute_wscale(rcv_win);
-	opt0h = V_NAGLE(0) |
-	    V_NO_CONG(nocong) |
-	    V_KEEP_ALIVE(1) |
-	    F_TCAM_BYPASS |
-	    V_WND_SCALE(wscale) |
-	    V_MSS_IDX(mtu_idx) |
-	    V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx);
-	opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10);
-	opt2 = F_RX_COALESCE_VALID | V_RX_COALESCE(0) | V_FLAVORS_VALID(1) |
-	       V_CONG_CONTROL_FLAVOR(cong_flavor);
-	skb->priority = CPL_PRIORITY_SETUP;
-	set_arp_failure_handler(skb, act_open_req_arp_failure);
-
-	req = skb_put(skb, sizeof(*req));
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, ep->atid));
-	req->local_port = ep->com.local_addr.sin_port;
-	req->peer_port = ep->com.remote_addr.sin_port;
-	req->local_ip = ep->com.local_addr.sin_addr.s_addr;
-	req->peer_ip = ep->com.remote_addr.sin_addr.s_addr;
-	req->opt0h = htonl(opt0h);
-	req->opt0l = htonl(opt0l);
-	req->params = 0;
-	req->opt2 = htonl(opt2);
-	return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
-}
-
-static void send_mpa_req(struct iwch_ep *ep, struct sk_buff *skb)
-{
-	int mpalen;
-	struct tx_data_wr *req;
-	struct mpa_message *mpa;
-	int len;
-
-	pr_debug("%s ep %p pd_len %d\n", __func__, ep, ep->plen);
-
-	BUG_ON(skb_cloned(skb));
-
-	mpalen = sizeof(*mpa) + ep->plen;
-	if (skb->data + mpalen + sizeof(*req) > skb_end_pointer(skb)) {
-		kfree_skb(skb);
-		skb=alloc_skb(mpalen + sizeof(*req), GFP_KERNEL);
-		if (!skb) {
-			connect_reply_upcall(ep, -ENOMEM);
-			return;
-		}
-	}
-	skb_trim(skb, 0);
-	skb_reserve(skb, sizeof(*req));
-	skb_put(skb, mpalen);
-	skb->priority = CPL_PRIORITY_DATA;
-	mpa = (struct mpa_message *) skb->data;
-	memset(mpa, 0, sizeof(*mpa));
-	memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key));
-	mpa->flags = (crc_enabled ? MPA_CRC : 0) |
-		     (markers_enabled ? MPA_MARKERS : 0);
-	mpa->private_data_size = htons(ep->plen);
-	mpa->revision = mpa_rev;
-
-	if (ep->plen)
-		memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen);
-
-	/*
-	 * Reference the mpa skb.  This ensures the data area
-	 * will remain in memory until the hw acks the tx.
-	 * Function tx_ack() will deref it.
-	 */
-	skb_get(skb);
-	set_arp_failure_handler(skb, arp_failure_discard);
-	skb_reset_transport_header(skb);
-	len = skb->len;
-	req = skb_push(skb, sizeof(*req));
-	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL);
-	req->wr_lo = htonl(V_WR_TID(ep->hwtid));
-	req->len = htonl(len);
-	req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
-			   V_TX_SNDBUF(snd_win>>15));
-	req->flags = htonl(F_TX_INIT);
-	req->sndseq = htonl(ep->snd_seq);
-	BUG_ON(ep->mpa_skb);
-	ep->mpa_skb = skb;
-	iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
-	start_ep_timer(ep);
-	state_set(&ep->com, MPA_REQ_SENT);
-	return;
-}
-
-static int send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen)
-{
-	int mpalen;
-	struct tx_data_wr *req;
-	struct mpa_message *mpa;
-	struct sk_buff *skb;
-
-	pr_debug("%s ep %p plen %d\n", __func__, ep, plen);
-
-	mpalen = sizeof(*mpa) + plen;
-
-	skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL);
-	if (!skb) {
-		pr_err("%s - cannot alloc skb!\n", __func__);
-		return -ENOMEM;
-	}
-	skb_reserve(skb, sizeof(*req));
-	mpa = skb_put(skb, mpalen);
-	memset(mpa, 0, sizeof(*mpa));
-	memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
-	mpa->flags = MPA_REJECT;
-	mpa->revision = mpa_rev;
-	mpa->private_data_size = htons(plen);
-	if (plen)
-		memcpy(mpa->private_data, pdata, plen);
-
-	/*
-	 * Reference the mpa skb again.  This ensures the data area
-	 * will remain in memory until the hw acks the tx.
-	 * Function tx_ack() will deref it.
-	 */
-	skb_get(skb);
-	skb->priority = CPL_PRIORITY_DATA;
-	set_arp_failure_handler(skb, arp_failure_discard);
-	skb_reset_transport_header(skb);
-	req = skb_push(skb, sizeof(*req));
-	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL);
-	req->wr_lo = htonl(V_WR_TID(ep->hwtid));
-	req->len = htonl(mpalen);
-	req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
-			   V_TX_SNDBUF(snd_win>>15));
-	req->flags = htonl(F_TX_INIT);
-	req->sndseq = htonl(ep->snd_seq);
-	BUG_ON(ep->mpa_skb);
-	ep->mpa_skb = skb;
-	return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
-}
-
-static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen)
-{
-	int mpalen;
-	struct tx_data_wr *req;
-	struct mpa_message *mpa;
-	int len;
-	struct sk_buff *skb;
-
-	pr_debug("%s ep %p plen %d\n", __func__, ep, plen);
-
-	mpalen = sizeof(*mpa) + plen;
-
-	skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL);
-	if (!skb) {
-		pr_err("%s - cannot alloc skb!\n", __func__);
-		return -ENOMEM;
-	}
-	skb->priority = CPL_PRIORITY_DATA;
-	skb_reserve(skb, sizeof(*req));
-	mpa = skb_put(skb, mpalen);
-	memset(mpa, 0, sizeof(*mpa));
-	memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
-	mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) |
-		     (markers_enabled ? MPA_MARKERS : 0);
-	mpa->revision = mpa_rev;
-	mpa->private_data_size = htons(plen);
-	if (plen)
-		memcpy(mpa->private_data, pdata, plen);
-
-	/*
-	 * Reference the mpa skb.  This ensures the data area
-	 * will remain in memory until the hw acks the tx.
-	 * Function tx_ack() will deref it.
-	 */
-	skb_get(skb);
-	set_arp_failure_handler(skb, arp_failure_discard);
-	skb_reset_transport_header(skb);
-	len = skb->len;
-	req = skb_push(skb, sizeof(*req));
-	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL);
-	req->wr_lo = htonl(V_WR_TID(ep->hwtid));
-	req->len = htonl(len);
-	req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
-			   V_TX_SNDBUF(snd_win>>15));
-	req->flags = htonl(F_TX_INIT);
-	req->sndseq = htonl(ep->snd_seq);
-	ep->mpa_skb = skb;
-	state_set(&ep->com, MPA_REP_SENT);
-	return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
-}
-
-static int act_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct iwch_ep *ep = ctx;
-	struct cpl_act_establish *req = cplhdr(skb);
-	unsigned int tid = GET_TID(req);
-
-	pr_debug("%s ep %p tid %d\n", __func__, ep, tid);
-
-	dst_confirm(ep->dst);
-
-	/* setup the hwtid for this connection */
-	ep->hwtid = tid;
-	cxgb3_insert_tid(ep->com.tdev, &t3c_client, ep, tid);
-
-	ep->snd_seq = ntohl(req->snd_isn);
-	ep->rcv_seq = ntohl(req->rcv_isn);
-
-	set_emss(ep, ntohs(req->tcp_opt));
-
-	/* dealloc the atid */
-	cxgb3_free_atid(ep->com.tdev, ep->atid);
-
-	/* start MPA negotiation */
-	send_mpa_req(ep, skb);
-
-	return 0;
-}
-
-static void abort_connection(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp)
-{
-	pr_debug("%s ep %p\n", __func__, ep);
-	state_set(&ep->com, ABORTING);
-	send_abort(ep, skb, gfp);
-}
-
-static void close_complete_upcall(struct iwch_ep *ep)
-{
-	struct iw_cm_event event;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	memset(&event, 0, sizeof(event));
-	event.event = IW_CM_EVENT_CLOSE;
-	if (ep->com.cm_id) {
-		pr_debug("close complete delivered ep %p cm_id %p tid %d\n",
-			 ep, ep->com.cm_id, ep->hwtid);
-		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
-		ep->com.cm_id->rem_ref(ep->com.cm_id);
-		ep->com.cm_id = NULL;
-		ep->com.qp = NULL;
-	}
-}
-
-static void peer_close_upcall(struct iwch_ep *ep)
-{
-	struct iw_cm_event event;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	memset(&event, 0, sizeof(event));
-	event.event = IW_CM_EVENT_DISCONNECT;
-	if (ep->com.cm_id) {
-		pr_debug("peer close delivered ep %p cm_id %p tid %d\n",
-			 ep, ep->com.cm_id, ep->hwtid);
-		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
-	}
-}
-
-static void peer_abort_upcall(struct iwch_ep *ep)
-{
-	struct iw_cm_event event;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	memset(&event, 0, sizeof(event));
-	event.event = IW_CM_EVENT_CLOSE;
-	event.status = -ECONNRESET;
-	if (ep->com.cm_id) {
-		pr_debug("abort delivered ep %p cm_id %p tid %d\n", ep,
-			 ep->com.cm_id, ep->hwtid);
-		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
-		ep->com.cm_id->rem_ref(ep->com.cm_id);
-		ep->com.cm_id = NULL;
-		ep->com.qp = NULL;
-	}
-}
-
-static void connect_reply_upcall(struct iwch_ep *ep, int status)
-{
-	struct iw_cm_event event;
-
-	pr_debug("%s ep %p status %d\n", __func__, ep, status);
-	memset(&event, 0, sizeof(event));
-	event.event = IW_CM_EVENT_CONNECT_REPLY;
-	event.status = status;
-	memcpy(&event.local_addr, &ep->com.local_addr,
-	       sizeof(ep->com.local_addr));
-	memcpy(&event.remote_addr, &ep->com.remote_addr,
-	       sizeof(ep->com.remote_addr));
-
-	if ((status == 0) || (status == -ECONNREFUSED)) {
-		event.private_data_len = ep->plen;
-		event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
-	}
-	if (ep->com.cm_id) {
-		pr_debug("%s ep %p tid %d status %d\n", __func__, ep,
-			 ep->hwtid, status);
-		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
-	}
-	if (status < 0) {
-		ep->com.cm_id->rem_ref(ep->com.cm_id);
-		ep->com.cm_id = NULL;
-		ep->com.qp = NULL;
-	}
-}
-
-static void connect_request_upcall(struct iwch_ep *ep)
-{
-	struct iw_cm_event event;
-
-	pr_debug("%s ep %p tid %d\n", __func__, ep, ep->hwtid);
-	memset(&event, 0, sizeof(event));
-	event.event = IW_CM_EVENT_CONNECT_REQUEST;
-	memcpy(&event.local_addr, &ep->com.local_addr,
-	       sizeof(ep->com.local_addr));
-	memcpy(&event.remote_addr, &ep->com.remote_addr,
-	       sizeof(ep->com.local_addr));
-	event.private_data_len = ep->plen;
-	event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
-	event.provider_data = ep;
-	/*
-	 * Until ird/ord negotiation via MPAv2 support is added, send max
-	 * supported values
-	 */
-	event.ird = event.ord = 8;
-	if (state_read(&ep->parent_ep->com) != DEAD) {
-		get_ep(&ep->com);
-		ep->parent_ep->com.cm_id->event_handler(
-						ep->parent_ep->com.cm_id,
-						&event);
-	}
-	put_ep(&ep->parent_ep->com);
-	ep->parent_ep = NULL;
-}
-
-static void established_upcall(struct iwch_ep *ep)
-{
-	struct iw_cm_event event;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	memset(&event, 0, sizeof(event));
-	event.event = IW_CM_EVENT_ESTABLISHED;
-	/*
-	 * Until ird/ord negotiation via MPAv2 support is added, send max
-	 * supported values
-	 */
-	event.ird = event.ord = 8;
-	if (ep->com.cm_id) {
-		pr_debug("%s ep %p tid %d\n", __func__, ep, ep->hwtid);
-		ep->com.cm_id->event_handler(ep->com.cm_id, &event);
-	}
-}
-
-static int update_rx_credits(struct iwch_ep *ep, u32 credits)
-{
-	struct cpl_rx_data_ack *req;
-	struct sk_buff *skb;
-
-	pr_debug("%s ep %p credits %u\n", __func__, ep, credits);
-	skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
-	if (!skb) {
-		pr_err("update_rx_credits - cannot alloc skb!\n");
-		return 0;
-	}
-
-	req = skb_put(skb, sizeof(*req));
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, ep->hwtid));
-	req->credit_dack = htonl(V_RX_CREDITS(credits) | V_RX_FORCE_ACK(1));
-	skb->priority = CPL_PRIORITY_ACK;
-	iwch_cxgb3_ofld_send(ep->com.tdev, skb);
-	return credits;
-}
-
-static void process_mpa_reply(struct iwch_ep *ep, struct sk_buff *skb)
-{
-	struct mpa_message *mpa;
-	u16 plen;
-	struct iwch_qp_attributes attrs;
-	enum iwch_qp_attr_mask mask;
-	int err;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-
-	/*
-	 * Stop mpa timer.  If it expired, then the state has
-	 * changed and we bail since ep_timeout already aborted
-	 * the connection.
-	 */
-	stop_ep_timer(ep);
-	if (state_read(&ep->com) != MPA_REQ_SENT)
-		return;
-
-	/*
-	 * If we get more than the supported amount of private data
-	 * then we must fail this connection.
-	 */
-	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
-		err = -EINVAL;
-		goto err;
-	}
-
-	/*
-	 * copy the new data into our accumulation buffer.
-	 */
-	skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]),
-				  skb->len);
-	ep->mpa_pkt_len += skb->len;
-
-	/*
-	 * if we don't even have the mpa message, then bail.
-	 */
-	if (ep->mpa_pkt_len < sizeof(*mpa))
-		return;
-	mpa = (struct mpa_message *) ep->mpa_pkt;
-
-	/* Validate MPA header. */
-	if (mpa->revision != mpa_rev) {
-		err = -EPROTO;
-		goto err;
-	}
-	if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) {
-		err = -EPROTO;
-		goto err;
-	}
-
-	plen = ntohs(mpa->private_data_size);
-
-	/*
-	 * Fail if there's too much private data.
-	 */
-	if (plen > MPA_MAX_PRIVATE_DATA) {
-		err = -EPROTO;
-		goto err;
-	}
-
-	/*
-	 * If plen does not account for pkt size
-	 */
-	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
-		err = -EPROTO;
-		goto err;
-	}
-
-	ep->plen = (u8) plen;
-
-	/*
-	 * If we don't have all the pdata yet, then bail.
-	 * We'll continue processing when more data arrives.
-	 */
-	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
-		return;
-
-	if (mpa->flags & MPA_REJECT) {
-		err = -ECONNREFUSED;
-		goto err;
-	}
-
-	/*
-	 * If we get here we have accumulated the entire mpa
-	 * start reply message including private data. And
-	 * the MPA header is valid.
-	 */
-	state_set(&ep->com, FPDU_MODE);
-	ep->mpa_attr.initiator = 1;
-	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
-	ep->mpa_attr.recv_marker_enabled = markers_enabled;
-	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
-	ep->mpa_attr.version = mpa_rev;
-	pr_debug("%s - crc_enabled=%d, recv_marker_enabled=%d, xmit_marker_enabled=%d, version=%d\n",
-		 __func__,
-		 ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
-		 ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version);
-
-	attrs.mpa_attr = ep->mpa_attr;
-	attrs.max_ird = ep->ird;
-	attrs.max_ord = ep->ord;
-	attrs.llp_stream_handle = ep;
-	attrs.next_state = IWCH_QP_STATE_RTS;
-
-	mask = IWCH_QP_ATTR_NEXT_STATE |
-	    IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR |
-	    IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD;
-
-	/* bind QP and TID with INIT_WR */
-	err = iwch_modify_qp(ep->com.qp->rhp,
-			     ep->com.qp, mask, &attrs, 1);
-	if (err)
-		goto err;
-
-	if (peer2peer && iwch_rqes_posted(ep->com.qp) == 0) {
-		iwch_post_zb_read(ep);
-	}
-
-	goto out;
-err:
-	abort_connection(ep, skb, GFP_KERNEL);
-out:
-	connect_reply_upcall(ep, err);
-	return;
-}
-
-static void process_mpa_request(struct iwch_ep *ep, struct sk_buff *skb)
-{
-	struct mpa_message *mpa;
-	u16 plen;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-
-	/*
-	 * Stop mpa timer.  If it expired, then the state has
-	 * changed and we bail since ep_timeout already aborted
-	 * the connection.
-	 */
-	stop_ep_timer(ep);
-	if (state_read(&ep->com) != MPA_REQ_WAIT)
-		return;
-
-	/*
-	 * If we get more than the supported amount of private data
-	 * then we must fail this connection.
-	 */
-	if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
-	}
-
-	pr_debug("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
-
-	/*
-	 * Copy the new data into our accumulation buffer.
-	 */
-	skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]),
-				  skb->len);
-	ep->mpa_pkt_len += skb->len;
-
-	/*
-	 * If we don't even have the mpa message, then bail.
-	 * We'll continue processing when more data arrives.
-	 */
-	if (ep->mpa_pkt_len < sizeof(*mpa))
-		return;
-	pr_debug("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
-	mpa = (struct mpa_message *) ep->mpa_pkt;
-
-	/*
-	 * Validate MPA Header.
-	 */
-	if (mpa->revision != mpa_rev) {
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
-	}
-
-	if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
-	}
-
-	plen = ntohs(mpa->private_data_size);
-
-	/*
-	 * Fail if there's too much private data.
-	 */
-	if (plen > MPA_MAX_PRIVATE_DATA) {
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
-	}
-
-	/*
-	 * If plen does not account for pkt size
-	 */
-	if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
-		abort_connection(ep, skb, GFP_KERNEL);
-		return;
-	}
-	ep->plen = (u8) plen;
-
-	/*
-	 * If we don't have all the pdata yet, then bail.
-	 */
-	if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
-		return;
-
-	/*
-	 * If we get here we have accumulated the entire mpa
-	 * start request message including private data.
-	 */
-	ep->mpa_attr.initiator = 0;
-	ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
-	ep->mpa_attr.recv_marker_enabled = markers_enabled;
-	ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
-	ep->mpa_attr.version = mpa_rev;
-	pr_debug("%s - crc_enabled=%d, recv_marker_enabled=%d, xmit_marker_enabled=%d, version=%d\n",
-		 __func__,
-		 ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
-		 ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version);
-
-	state_set(&ep->com, MPA_REQ_RCVD);
-
-	/* drive upcall */
-	connect_request_upcall(ep);
-	return;
-}
-
-static int rx_data(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct iwch_ep *ep = ctx;
-	struct cpl_rx_data *hdr = cplhdr(skb);
-	unsigned int dlen = ntohs(hdr->len);
-
-	pr_debug("%s ep %p dlen %u\n", __func__, ep, dlen);
-
-	skb_pull(skb, sizeof(*hdr));
-	skb_trim(skb, dlen);
-
-	ep->rcv_seq += dlen;
-	BUG_ON(ep->rcv_seq != (ntohl(hdr->seq) + dlen));
-
-	switch (state_read(&ep->com)) {
-	case MPA_REQ_SENT:
-		process_mpa_reply(ep, skb);
-		break;
-	case MPA_REQ_WAIT:
-		process_mpa_request(ep, skb);
-		break;
-	case MPA_REP_SENT:
-		break;
-	default:
-		pr_err("%s Unexpected streaming data. ep %p state %d tid %d\n",
-		       __func__, ep, state_read(&ep->com), ep->hwtid);
-
-		/*
-		 * The ep will timeout and inform the ULP of the failure.
-		 * See ep_timeout().
-		 */
-		break;
-	}
-
-	/* update RX credits */
-	update_rx_credits(ep, dlen);
-
-	return CPL_RET_BUF_DONE;
-}
-
-/*
- * Upcall from the adapter indicating data has been transmitted.
- * For us it's just the single MPA request or reply.  We can now free
- * the skb holding the mpa message.
- */
-static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct iwch_ep *ep = ctx;
-	struct cpl_wr_ack *hdr = cplhdr(skb);
-	unsigned int credits = ntohs(hdr->credits);
-	unsigned long flags;
-	int post_zb = 0;
-
-	pr_debug("%s ep %p credits %u\n", __func__, ep, credits);
-
-	if (credits == 0) {
-		pr_debug("%s 0 credit ack  ep %p state %u\n",
-			 __func__, ep, state_read(&ep->com));
-		return CPL_RET_BUF_DONE;
-	}
-
-	spin_lock_irqsave(&ep->com.lock, flags);
-	BUG_ON(credits != 1);
-	dst_confirm(ep->dst);
-	if (!ep->mpa_skb) {
-		pr_debug("%s rdma_init wr_ack ep %p state %u\n",
-			 __func__, ep, ep->com.state);
-		if (ep->mpa_attr.initiator) {
-			pr_debug("%s initiator ep %p state %u\n",
-				 __func__, ep, ep->com.state);
-			if (peer2peer && ep->com.state == FPDU_MODE)
-				post_zb = 1;
-		} else {
-			pr_debug("%s responder ep %p state %u\n",
-				 __func__, ep, ep->com.state);
-			if (ep->com.state == MPA_REQ_RCVD) {
-				ep->com.rpl_done = 1;
-				wake_up(&ep->com.waitq);
-			}
-		}
-	} else {
-		pr_debug("%s lsm ack ep %p state %u freeing skb\n",
-			 __func__, ep, ep->com.state);
-		kfree_skb(ep->mpa_skb);
-		ep->mpa_skb = NULL;
-	}
-	spin_unlock_irqrestore(&ep->com.lock, flags);
-	if (post_zb)
-		iwch_post_zb_read(ep);
-		WARN(1, "%s timer stopped when it's not running!  ep %p state %u\n",
-}
-
-static int abort_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct iwch_ep *ep = ctx;
-	unsigned long flags;
-	int release = 0;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	BUG_ON(!ep);
-
-	/*
-	 * We get 2 abort replies from the HW.  The first one must
-	 * be ignored except for scribbling that we need one more.
-	 */
-	if (!test_and_set_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags)) {
-		return CPL_RET_BUF_DONE;
-	}
-
-	spin_lock_irqsave(&ep->com.lock, flags);
-	switch (ep->com.state) {
-	case ABORTING:
-		close_complete_upcall(ep);
-		__state_set(&ep->com, DEAD);
-		release = 1;
-		break;
-	default:
-		pr_err("%s ep %p state %d\n", __func__, ep, ep->com.state);
-		break;
-	}
-	spin_unlock_irqrestore(&ep->com.lock, flags);
-
-	if (release)
-		release_ep_resources(ep);
-	return CPL_RET_BUF_DONE;
-}
-
-/*
- * Return whether a failed active open has allocated a TID
- */
-static inline int act_open_has_tid(int status)
-{
-	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
-	       status != CPL_ERR_ARP_MISS;
-}
-
-static int act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct iwch_ep *ep = ctx;
-	struct cpl_act_open_rpl *rpl = cplhdr(skb);
-
-	pr_debug("%s ep %p status %u errno %d\n", __func__, ep, rpl->status,
-		 status2errno(rpl->status));
-	connect_reply_upcall(ep, status2errno(rpl->status));
-	state_set(&ep->com, DEAD);
-	if (ep->com.tdev->type != T3A && act_open_has_tid(rpl->status))
-		release_tid(ep->com.tdev, GET_TID(rpl), NULL);
-	cxgb3_free_atid(ep->com.tdev, ep->atid);
-	dst_release(ep->dst);
-	l2t_release(ep->com.tdev, ep->l2t);
-	put_ep(&ep->com);
-	return CPL_RET_BUF_DONE;
-}
-
-static int listen_start(struct iwch_listen_ep *ep)
-{
-	struct sk_buff *skb;
-	struct cpl_pass_open_req *req;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
-	if (!skb) {
-		pr_err("t3c_listen_start failed to alloc skb!\n");
-		return -ENOMEM;
-	}
-
-	req = skb_put(skb, sizeof(*req));
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, ep->stid));
-	req->local_port = ep->com.local_addr.sin_port;
-	req->local_ip = ep->com.local_addr.sin_addr.s_addr;
-	req->peer_port = 0;
-	req->peer_ip = 0;
-	req->peer_netmask = 0;
-	req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS);
-	req->opt0l = htonl(V_RCV_BUFSIZ(rcv_win>>10));
-	req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));
-
-	skb->priority = 1;
-	return iwch_cxgb3_ofld_send(ep->com.tdev, skb);
-}
-
-static int pass_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct iwch_listen_ep *ep = ctx;
-	struct cpl_pass_open_rpl *rpl = cplhdr(skb);
-
-	pr_debug("%s ep %p status %d error %d\n", __func__, ep,
-		 rpl->status, status2errno(rpl->status));
-	ep->com.rpl_err = status2errno(rpl->status);
-	ep->com.rpl_done = 1;
-	wake_up(&ep->com.waitq);
-
-	return CPL_RET_BUF_DONE;
-}
-
-static int listen_stop(struct iwch_listen_ep *ep)
-{
-	struct sk_buff *skb;
-	struct cpl_close_listserv_req *req;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
-	if (!skb) {
-		pr_err("%s - failed to alloc skb\n", __func__);
-		return -ENOMEM;
-	}
-	req = skb_put(skb, sizeof(*req));
-	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	req->cpu_idx = 0;
-	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, ep->stid));
-	skb->priority = 1;
-	return iwch_cxgb3_ofld_send(ep->com.tdev, skb);
-}
-
-static int close_listsrv_rpl(struct t3cdev *tdev, struct sk_buff *skb,
-			     void *ctx)
-{
-	struct iwch_listen_ep *ep = ctx;
-	struct cpl_close_listserv_rpl *rpl = cplhdr(skb);
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	ep->com.rpl_err = status2errno(rpl->status);
-	ep->com.rpl_done = 1;
-	wake_up(&ep->com.waitq);
-	return CPL_RET_BUF_DONE;
-}
-
-static void accept_cr(struct iwch_ep *ep, __be32 peer_ip, struct sk_buff *skb)
-{
-	struct cpl_pass_accept_rpl *rpl;
-	unsigned int mtu_idx;
-	u32 opt0h, opt0l, opt2;
-	int wscale;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	BUG_ON(skb_cloned(skb));
-	skb_trim(skb, sizeof(*rpl));
-	skb_get(skb);
-	mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst));
-	wscale = compute_wscale(rcv_win);
-	opt0h = V_NAGLE(0) |
-	    V_NO_CONG(nocong) |
-	    V_KEEP_ALIVE(1) |
-	    F_TCAM_BYPASS |
-	    V_WND_SCALE(wscale) |
-	    V_MSS_IDX(mtu_idx) |
-	    V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx);
-	opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10);
-	opt2 = F_RX_COALESCE_VALID | V_RX_COALESCE(0) | V_FLAVORS_VALID(1) |
-	       V_CONG_CONTROL_FLAVOR(cong_flavor);
-
-	rpl = cplhdr(skb);
-	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, ep->hwtid));
-	rpl->peer_ip = peer_ip;
-	rpl->opt0h = htonl(opt0h);
-	rpl->opt0l_status = htonl(opt0l | CPL_PASS_OPEN_ACCEPT);
-	rpl->opt2 = htonl(opt2);
-	rpl->rsvd = rpl->opt2;	/* workaround for HW bug */
-	skb->priority = CPL_PRIORITY_SETUP;
-	iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
-
-	return;
-}
-
-static void reject_cr(struct t3cdev *tdev, u32 hwtid, __be32 peer_ip,
-		      struct sk_buff *skb)
-{
-	pr_debug("%s t3cdev %p tid %u peer_ip %x\n", __func__, tdev, hwtid,
-		 peer_ip);
-	BUG_ON(skb_cloned(skb));
-	skb_trim(skb, sizeof(struct cpl_tid_release));
-	skb_get(skb);
-
-	if (tdev->type != T3A)
-		release_tid(tdev, hwtid, skb);
-	else {
-		struct cpl_pass_accept_rpl *rpl;
-
-		rpl = cplhdr(skb);
-		skb->priority = CPL_PRIORITY_SETUP;
-		rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-		OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL,
-						      hwtid));
-		rpl->peer_ip = peer_ip;
-		rpl->opt0h = htonl(F_TCAM_BYPASS);
-		rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
-		rpl->opt2 = 0;
-		rpl->rsvd = rpl->opt2;
-		iwch_cxgb3_ofld_send(tdev, skb);
-	}
-}
-
-static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct iwch_ep *child_ep, *parent_ep = ctx;
-	struct cpl_pass_accept_req *req = cplhdr(skb);
-	unsigned int hwtid = GET_TID(req);
-	struct dst_entry *dst;
-	struct l2t_entry *l2t;
-	struct rtable *rt;
-	struct iff_mac tim;
-
-	pr_debug("%s parent ep %p tid %u\n", __func__, parent_ep, hwtid);
-
-	if (state_read(&parent_ep->com) != LISTEN) {
-		pr_err("%s - listening ep not in LISTEN\n", __func__);
-		goto reject;
-	}
-
-	/*
-	 * Find the netdev for this connection request.
-	 */
-	tim.mac_addr = req->dst_mac;
-	tim.vlan_tag = ntohs(req->vlan_tag);
-	if (tdev->ctl(tdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
-		pr_err("%s bad dst mac %pM\n", __func__, req->dst_mac);
-		goto reject;
-	}
-
-	/* Find output route */
-	rt = find_route(tdev,
-			req->local_ip,
-			req->peer_ip,
-			req->local_port,
-			req->peer_port, G_PASS_OPEN_TOS(ntohl(req->tos_tid)));
-	if (!rt) {
-		pr_err("%s - failed to find dst entry!\n", __func__);
-		goto reject;
-	}
-	dst = &rt->dst;
-	l2t = t3_l2t_get(tdev, dst, NULL, &req->peer_ip);
-	if (!l2t) {
-		pr_err("%s - failed to allocate l2t entry!\n", __func__);
-		dst_release(dst);
-		goto reject;
-	}
-	child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL);
-	if (!child_ep) {
-		pr_err("%s - failed to allocate ep entry!\n", __func__);
-		l2t_release(tdev, l2t);
-		dst_release(dst);
-		goto reject;
-	}
-	state_set(&child_ep->com, CONNECTING);
-	child_ep->com.tdev = tdev;
-	child_ep->com.cm_id = NULL;
-	child_ep->com.local_addr.sin_family = AF_INET;
-	child_ep->com.local_addr.sin_port = req->local_port;
-	child_ep->com.local_addr.sin_addr.s_addr = req->local_ip;
-	child_ep->com.remote_addr.sin_family = AF_INET;
-	child_ep->com.remote_addr.sin_port = req->peer_port;
-	child_ep->com.remote_addr.sin_addr.s_addr = req->peer_ip;
-	get_ep(&parent_ep->com);
-	child_ep->parent_ep = parent_ep;
-	child_ep->tos = G_PASS_OPEN_TOS(ntohl(req->tos_tid));
-	child_ep->l2t = l2t;
-	child_ep->dst = dst;
-	child_ep->hwtid = hwtid;
-	timer_setup(&child_ep->timer, ep_timeout, 0);
-	cxgb3_insert_tid(tdev, &t3c_client, child_ep, hwtid);
-	accept_cr(child_ep, req->peer_ip, skb);
-	goto out;
-reject:
-	reject_cr(tdev, hwtid, req->peer_ip, skb);
-out:
-	return CPL_RET_BUF_DONE;
-}
-
-static int pass_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct iwch_ep *ep = ctx;
-	struct cpl_pass_establish *req = cplhdr(skb);
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	ep->snd_seq = ntohl(req->snd_isn);
-	ep->rcv_seq = ntohl(req->rcv_isn);
-
-	set_emss(ep, ntohs(req->tcp_opt));
-
-	dst_confirm(ep->dst);
-	state_set(&ep->com, MPA_REQ_WAIT);
-	start_ep_timer(ep);
-
-	return CPL_RET_BUF_DONE;
-}
-
-static int peer_close(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct iwch_ep *ep = ctx;
-	struct iwch_qp_attributes attrs;
-	unsigned long flags;
-	int disconnect = 1;
-	int release = 0;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	dst_confirm(ep->dst);
-
-	spin_lock_irqsave(&ep->com.lock, flags);
-	switch (ep->com.state) {
-	case MPA_REQ_WAIT:
-		__state_set(&ep->com, CLOSING);
-		break;
-	case MPA_REQ_SENT:
-		__state_set(&ep->com, CLOSING);
-		connect_reply_upcall(ep, -ECONNRESET);
-		break;
-	case MPA_REQ_RCVD:
-
-		/*
-		 * We're gonna mark this puppy DEAD, but keep
-		 * the reference on it until the ULP accepts or
-		 * rejects the CR. Also wake up anyone waiting
-		 * in rdma connection migration (see iwch_accept_cr()).
-		 */
-		__state_set(&ep->com, CLOSING);
-		ep->com.rpl_done = 1;
-		ep->com.rpl_err = -ECONNRESET;
-		pr_debug("waking up ep %p\n", ep);
-		wake_up(&ep->com.waitq);
-		break;
-	case MPA_REP_SENT:
-		__state_set(&ep->com, CLOSING);
-		ep->com.rpl_done = 1;
-		ep->com.rpl_err = -ECONNRESET;
-		pr_debug("waking up ep %p\n", ep);
-		wake_up(&ep->com.waitq);
-		break;
-	case FPDU_MODE:
-		start_ep_timer(ep);
-		__state_set(&ep->com, CLOSING);
-		attrs.next_state = IWCH_QP_STATE_CLOSING;
-		iwch_modify_qp(ep->com.qp->rhp, ep->com.qp,
-			       IWCH_QP_ATTR_NEXT_STATE, &attrs, 1);
-		peer_close_upcall(ep);
-		break;
-	case ABORTING:
-		disconnect = 0;
-		break;
-	case CLOSING:
-		__state_set(&ep->com, MORIBUND);
-		disconnect = 0;
-		break;
-	case MORIBUND:
-		stop_ep_timer(ep);
-		if (ep->com.cm_id && ep->com.qp) {
-			attrs.next_state = IWCH_QP_STATE_IDLE;
-			iwch_modify_qp(ep->com.qp->rhp, ep->com.qp,
-				       IWCH_QP_ATTR_NEXT_STATE, &attrs, 1);
-		}
-		close_complete_upcall(ep);
-		__state_set(&ep->com, DEAD);
-		release = 1;
-		disconnect = 0;
-		break;
-	case DEAD:
-		disconnect = 0;
-		break;
-	default:
-		BUG_ON(1);
-	}
-	spin_unlock_irqrestore(&ep->com.lock, flags);
-	if (disconnect)
-		iwch_ep_disconnect(ep, 0, GFP_KERNEL);
-	if (release)
-		release_ep_resources(ep);
-	return CPL_RET_BUF_DONE;
-}
-
-/*
- * Returns whether an ABORT_REQ_RSS message is a negative advice.
- */
-static int is_neg_adv_abort(unsigned int status)
-{
-	return status == CPL_ERR_RTX_NEG_ADVICE ||
-	       status == CPL_ERR_PERSIST_NEG_ADVICE;
-}
-
-static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct cpl_abort_req_rss *req = cplhdr(skb);
-	struct iwch_ep *ep = ctx;
-	struct cpl_abort_rpl *rpl;
-	struct sk_buff *rpl_skb;
-	struct iwch_qp_attributes attrs;
-	int ret;
-	int release = 0;
-	unsigned long flags;
-
-	if (is_neg_adv_abort(req->status)) {
-		pr_debug("%s neg_adv_abort ep %p tid %d\n", __func__, ep,
-			 ep->hwtid);
-		t3_l2t_send_event(ep->com.tdev, ep->l2t);
-		return CPL_RET_BUF_DONE;
-	}
-
-	/*
-	 * We get 2 peer aborts from the HW.  The first one must
-	 * be ignored except for scribbling that we need one more.
-	 */
-	if (!test_and_set_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags)) {
-		return CPL_RET_BUF_DONE;
-	}
-
-	spin_lock_irqsave(&ep->com.lock, flags);
-	pr_debug("%s ep %p state %u\n", __func__, ep, ep->com.state);
-	switch (ep->com.state) {
-	case CONNECTING:
-		break;
-	case MPA_REQ_WAIT:
-		stop_ep_timer(ep);
-		break;
-	case MPA_REQ_SENT:
-		stop_ep_timer(ep);
-		connect_reply_upcall(ep, -ECONNRESET);
-		break;
-	case MPA_REP_SENT:
-		ep->com.rpl_done = 1;
-		ep->com.rpl_err = -ECONNRESET;
-		pr_debug("waking up ep %p\n", ep);
-		wake_up(&ep->com.waitq);
-		break;
-	case MPA_REQ_RCVD:
-
-		/*
-		 * We're gonna mark this puppy DEAD, but keep
-		 * the reference on it until the ULP accepts or
-		 * rejects the CR. Also wake up anyone waiting
-		 * in rdma connection migration (see iwch_accept_cr()).
-		 */
-		ep->com.rpl_done = 1;
-		ep->com.rpl_err = -ECONNRESET;
-		pr_debug("waking up ep %p\n", ep);
-		wake_up(&ep->com.waitq);
-		break;
-	case MORIBUND:
-	case CLOSING:
-		stop_ep_timer(ep);
-		/*FALLTHROUGH*/
-	case FPDU_MODE:
-		if (ep->com.cm_id && ep->com.qp) {
-			attrs.next_state = IWCH_QP_STATE_ERROR;
-			ret = iwch_modify_qp(ep->com.qp->rhp,
-				     ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
-				     &attrs, 1);
-			if (ret)
-				pr_err("%s - qp <- error failed!\n", __func__);
-		}
-		peer_abort_upcall(ep);
-		break;
-	case ABORTING:
-		break;
-	case DEAD:
-		pr_debug("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__);
-		spin_unlock_irqrestore(&ep->com.lock, flags);
-		return CPL_RET_BUF_DONE;
-	default:
-		BUG_ON(1);
-		break;
-	}
-	dst_confirm(ep->dst);
-	if (ep->com.state != ABORTING) {
-		__state_set(&ep->com, DEAD);
-		release = 1;
-	}
-	spin_unlock_irqrestore(&ep->com.lock, flags);
-
-	rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL);
-	if (!rpl_skb) {
-		pr_err("%s - cannot allocate skb!\n", __func__);
-		release = 1;
-		goto out;
-	}
-	rpl_skb->priority = CPL_PRIORITY_DATA;
-	rpl = skb_put(rpl_skb, sizeof(*rpl));
-	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
-	rpl->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
-	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid));
-	rpl->cmd = CPL_ABORT_NO_RST;
-	iwch_cxgb3_ofld_send(ep->com.tdev, rpl_skb);
-out:
-	if (release)
-		release_ep_resources(ep);
-	return CPL_RET_BUF_DONE;
-}
-
-static int close_con_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct iwch_ep *ep = ctx;
-	struct iwch_qp_attributes attrs;
-	unsigned long flags;
-	int release = 0;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	BUG_ON(!ep);
-
-	/* The cm_id may be null if we failed to connect */
-	spin_lock_irqsave(&ep->com.lock, flags);
-	switch (ep->com.state) {
-	case CLOSING:
-		__state_set(&ep->com, MORIBUND);
-		break;
-	case MORIBUND:
-		stop_ep_timer(ep);
-		if ((ep->com.cm_id) && (ep->com.qp)) {
-			attrs.next_state = IWCH_QP_STATE_IDLE;
-			iwch_modify_qp(ep->com.qp->rhp,
-					     ep->com.qp,
-					     IWCH_QP_ATTR_NEXT_STATE,
-					     &attrs, 1);
-		}
-		close_complete_upcall(ep);
-		__state_set(&ep->com, DEAD);
-		release = 1;
-		break;
-	case ABORTING:
-	case DEAD:
-		break;
-	default:
-		BUG_ON(1);
-		break;
-	}
-	spin_unlock_irqrestore(&ep->com.lock, flags);
-	if (release)
-		release_ep_resources(ep);
-	return CPL_RET_BUF_DONE;
-}
-
-/*
- * T3A does 3 things when a TERM is received:
- * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet
- * 2) generate an async event on the QP with the TERMINATE opcode
- * 3) post a TERMINATE opcode cqe into the associated CQ.
- *
- * For (1), we save the message in the qp for later consumer consumption.
- * For (2), we move the QP into TERMINATE, post a QP event and disconnect.
- * For (3), we toss the CQE in cxio_poll_cq().
- *
- * terminate() handles case (1)...
- */
-static int terminate(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct iwch_ep *ep = ctx;
-
-	if (state_read(&ep->com) != FPDU_MODE)
-		return CPL_RET_BUF_DONE;
-
-	pr_debug("%s ep %p\n", __func__, ep);
-	skb_pull(skb, sizeof(struct cpl_rdma_terminate));
-	pr_debug("%s saving %d bytes of term msg\n", __func__, skb->len);
-	skb_copy_from_linear_data(skb, ep->com.qp->attr.terminate_buffer,
-				  skb->len);
-	ep->com.qp->attr.terminate_msg_len = skb->len;
-	ep->com.qp->attr.is_terminate_local = 0;
-	return CPL_RET_BUF_DONE;
-}
-
-static int ec_status(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct cpl_rdma_ec_status *rep = cplhdr(skb);
-	struct iwch_ep *ep = ctx;
-
-	pr_debug("%s ep %p tid %u status %d\n", __func__, ep, ep->hwtid,
-		 rep->status);
-	if (rep->status) {
-		struct iwch_qp_attributes attrs;
-
-		pr_err("%s BAD CLOSE - Aborting tid %u\n",
-		       __func__, ep->hwtid);
-		stop_ep_timer(ep);
-		attrs.next_state = IWCH_QP_STATE_ERROR;
-		iwch_modify_qp(ep->com.qp->rhp,
-			       ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
-			       &attrs, 1);
-		abort_connection(ep, NULL, GFP_KERNEL);
-	}
-	return CPL_RET_BUF_DONE;
-}
-
-static void ep_timeout(struct timer_list *t)
-{
-	struct iwch_ep *ep = from_timer(ep, t, timer);
-	struct iwch_qp_attributes attrs;
-	unsigned long flags;
-	int abort = 1;
-
-	spin_lock_irqsave(&ep->com.lock, flags);
-	pr_debug("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid,
-		 ep->com.state);
-	switch (ep->com.state) {
-	case MPA_REQ_SENT:
-		__state_set(&ep->com, ABORTING);
-		connect_reply_upcall(ep, -ETIMEDOUT);
-		break;
-	case MPA_REQ_WAIT:
-		__state_set(&ep->com, ABORTING);
-		break;
-	case CLOSING:
-	case MORIBUND:
-		if (ep->com.cm_id && ep->com.qp) {
-			attrs.next_state = IWCH_QP_STATE_ERROR;
-			iwch_modify_qp(ep->com.qp->rhp,
-				     ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
-				     &attrs, 1);
-		}
-		__state_set(&ep->com, ABORTING);
-		break;
-	default:
-		WARN(1, "%s unexpected state ep %p state %u\n",
-			__func__, ep, ep->com.state);
-		abort = 0;
-	}
-	spin_unlock_irqrestore(&ep->com.lock, flags);
-	if (abort)
-		abort_connection(ep, NULL, GFP_ATOMIC);
-	put_ep(&ep->com);
-}
-
-int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
-{
-	struct iwch_ep *ep = to_ep(cm_id);
-
-	pr_debug("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
-
-	if (state_read(&ep->com) == DEAD) {
-		put_ep(&ep->com);
-		return -ECONNRESET;
-	}
-	BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD);
-	if (mpa_rev == 0)
-		abort_connection(ep, NULL, GFP_KERNEL);
-	else {
-		send_mpa_reject(ep, pdata, pdata_len);
-		iwch_ep_disconnect(ep, 0, GFP_KERNEL);
-	}
-	put_ep(&ep->com);
-	return 0;
-}
-
-int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
-{
-	int err;
-	struct iwch_qp_attributes attrs;
-	enum iwch_qp_attr_mask mask;
-	struct iwch_ep *ep = to_ep(cm_id);
-	struct iwch_dev *h = to_iwch_dev(cm_id->device);
-	struct iwch_qp *qp = get_qhp(h, conn_param->qpn);
-
-	pr_debug("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
-	if (state_read(&ep->com) == DEAD) {
-		err = -ECONNRESET;
-		goto err;
-	}
-
-	BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD);
-	BUG_ON(!qp);
-
-	if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) ||
-	    (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) {
-		abort_connection(ep, NULL, GFP_KERNEL);
-		err = -EINVAL;
-		goto err;
-	}
-
-	cm_id->add_ref(cm_id);
-	ep->com.cm_id = cm_id;
-	ep->com.qp = qp;
-
-	ep->ird = conn_param->ird;
-	ep->ord = conn_param->ord;
-
-	if (peer2peer && ep->ird == 0)
-		ep->ird = 1;
-
-	pr_debug("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord);
-
-	/* bind QP to EP and move to RTS */
-	attrs.mpa_attr = ep->mpa_attr;
-	attrs.max_ird = ep->ird;
-	attrs.max_ord = ep->ord;
-	attrs.llp_stream_handle = ep;
-	attrs.next_state = IWCH_QP_STATE_RTS;
-
-	/* bind QP and TID with INIT_WR */
-	mask = IWCH_QP_ATTR_NEXT_STATE |
-			     IWCH_QP_ATTR_LLP_STREAM_HANDLE |
-			     IWCH_QP_ATTR_MPA_ATTR |
-			     IWCH_QP_ATTR_MAX_IRD |
-			     IWCH_QP_ATTR_MAX_ORD;
-
-	err = iwch_modify_qp(ep->com.qp->rhp,
-			     ep->com.qp, mask, &attrs, 1);
-	if (err)
-		goto err1;
-
-	/* if needed, wait for wr_ack */
-	if (iwch_rqes_posted(qp)) {
-		wait_event(ep->com.waitq, ep->com.rpl_done);
-		err = ep->com.rpl_err;
-		if (err)
-			goto err1;
-	}
-
-	err = send_mpa_reply(ep, conn_param->private_data,
-			     conn_param->private_data_len);
-	if (err)
-		goto err1;
-
-
-	state_set(&ep->com, FPDU_MODE);
-	established_upcall(ep);
-	put_ep(&ep->com);
-	return 0;
-err1:
-	ep->com.cm_id = NULL;
-	ep->com.qp = NULL;
-	cm_id->rem_ref(cm_id);
-err:
-	put_ep(&ep->com);
-	return err;
-}
-
-static int is_loopback_dst(struct iw_cm_id *cm_id)
-{
-	struct net_device *dev;
-	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
-
-	dev = ip_dev_find(&init_net, raddr->sin_addr.s_addr);
-	if (!dev)
-		return 0;
-	dev_put(dev);
-	return 1;
-}
-
-int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
-{
-	struct iwch_dev *h = to_iwch_dev(cm_id->device);
-	struct iwch_ep *ep;
-	struct rtable *rt;
-	int err = 0;
-	struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->m_local_addr;
-	struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->m_remote_addr;
-
-	if (cm_id->m_remote_addr.ss_family != PF_INET) {
-		err = -ENOSYS;
-		goto out;
-	}
-
-	if (is_loopback_dst(cm_id)) {
-		err = -ENOSYS;
-		goto out;
-	}
-
-	ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
-	if (!ep) {
-		pr_err("%s - cannot alloc ep\n", __func__);
-		err = -ENOMEM;
-		goto out;
-	}
-	timer_setup(&ep->timer, ep_timeout, 0);
-	ep->plen = conn_param->private_data_len;
-	if (ep->plen)
-		memcpy(ep->mpa_pkt + sizeof(struct mpa_message),
-		       conn_param->private_data, ep->plen);
-	ep->ird = conn_param->ird;
-	ep->ord = conn_param->ord;
-
-	if (peer2peer && ep->ord == 0)
-		ep->ord = 1;
-
-	ep->com.tdev = h->rdev.t3cdev_p;
-
-	cm_id->add_ref(cm_id);
-	ep->com.cm_id = cm_id;
-	ep->com.qp = get_qhp(h, conn_param->qpn);
-	BUG_ON(!ep->com.qp);
-	pr_debug("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn,
-		 ep->com.qp, cm_id);
-
-	/*
-	 * Allocate an active TID to initiate a TCP connection.
-	 */
-	ep->atid = cxgb3_alloc_atid(h->rdev.t3cdev_p, &t3c_client, ep);
-	if (ep->atid == -1) {
-		pr_err("%s - cannot alloc atid\n", __func__);
-		err = -ENOMEM;
-		goto fail2;
-	}
-
-	/* find a route */
-	rt = find_route(h->rdev.t3cdev_p, laddr->sin_addr.s_addr,
-			raddr->sin_addr.s_addr, laddr->sin_port,
-			raddr->sin_port, IPTOS_LOWDELAY);
-	if (!rt) {
-		pr_err("%s - cannot find route\n", __func__);
-		err = -EHOSTUNREACH;
-		goto fail3;
-	}
-	ep->dst = &rt->dst;
-	ep->l2t = t3_l2t_get(ep->com.tdev, ep->dst, NULL,
-			     &raddr->sin_addr.s_addr);
-	if (!ep->l2t) {
-		pr_err("%s - cannot alloc l2e\n", __func__);
-		err = -ENOMEM;
-		goto fail4;
-	}
-
-	state_set(&ep->com, CONNECTING);
-	ep->tos = IPTOS_LOWDELAY;
-	memcpy(&ep->com.local_addr, &cm_id->m_local_addr,
-	       sizeof(ep->com.local_addr));
-	memcpy(&ep->com.remote_addr, &cm_id->m_remote_addr,
-	       sizeof(ep->com.remote_addr));
-
-	/* send connect request to rnic */
-	err = send_connect(ep);
-	if (!err)
-		goto out;
-
-	l2t_release(h->rdev.t3cdev_p, ep->l2t);
-fail4:
-	dst_release(ep->dst);
-fail3:
-	cxgb3_free_atid(ep->com.tdev, ep->atid);
-fail2:
-	cm_id->rem_ref(cm_id);
-	put_ep(&ep->com);
-out:
-	return err;
-}
-
-int iwch_create_listen(struct iw_cm_id *cm_id, int backlog)
-{
-	int err = 0;
-	struct iwch_dev *h = to_iwch_dev(cm_id->device);
-	struct iwch_listen_ep *ep;
-
-
-	might_sleep();
-
-	if (cm_id->m_local_addr.ss_family != PF_INET) {
-		err = -ENOSYS;
-		goto fail1;
-	}
-
-	ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
-	if (!ep) {
-		pr_err("%s - cannot alloc ep\n", __func__);
-		err = -ENOMEM;
-		goto fail1;
-	}
-	pr_debug("%s ep %p\n", __func__, ep);
-	ep->com.tdev = h->rdev.t3cdev_p;
-	cm_id->add_ref(cm_id);
-	ep->com.cm_id = cm_id;
-	ep->backlog = backlog;
-	memcpy(&ep->com.local_addr, &cm_id->m_local_addr,
-	       sizeof(ep->com.local_addr));
-
-	/*
-	 * Allocate a server TID.
-	 */
-	ep->stid = cxgb3_alloc_stid(h->rdev.t3cdev_p, &t3c_client, ep);
-	if (ep->stid == -1) {
-		pr_err("%s - cannot alloc atid\n", __func__);
-		err = -ENOMEM;
-		goto fail2;
-	}
-
-	state_set(&ep->com, LISTEN);
-	err = listen_start(ep);
-	if (err)
-		goto fail3;
-
-	/* wait for pass_open_rpl */
-	wait_event(ep->com.waitq, ep->com.rpl_done);
-	err = ep->com.rpl_err;
-	if (!err) {
-		cm_id->provider_data = ep;
-		goto out;
-	}
-fail3:
-	cxgb3_free_stid(ep->com.tdev, ep->stid);
-fail2:
-	cm_id->rem_ref(cm_id);
-	put_ep(&ep->com);
-fail1:
-out:
-	return err;
-}
-
-int iwch_destroy_listen(struct iw_cm_id *cm_id)
-{
-	int err;
-	struct iwch_listen_ep *ep = to_listen_ep(cm_id);
-
-	pr_debug("%s ep %p\n", __func__, ep);
-
-	might_sleep();
-	state_set(&ep->com, DEAD);
-	ep->com.rpl_done = 0;
-	ep->com.rpl_err = 0;
-	err = listen_stop(ep);
-	if (err)
-		goto done;
-	wait_event(ep->com.waitq, ep->com.rpl_done);
-	cxgb3_free_stid(ep->com.tdev, ep->stid);
-done:
-	err = ep->com.rpl_err;
-	cm_id->rem_ref(cm_id);
-	put_ep(&ep->com);
-	return err;
-}
-
-int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp)
-{
-	int ret=0;
-	unsigned long flags;
-	int close = 0;
-	int fatal = 0;
-	struct t3cdev *tdev;
-	struct cxio_rdev *rdev;
-
-	spin_lock_irqsave(&ep->com.lock, flags);
-
-	pr_debug("%s ep %p state %s, abrupt %d\n", __func__, ep,
-		 states[ep->com.state], abrupt);
-
-	tdev = (struct t3cdev *)ep->com.tdev;
-	rdev = (struct cxio_rdev *)tdev->ulp;
-	if (cxio_fatal_error(rdev)) {
-		fatal = 1;
-		close_complete_upcall(ep);
-		ep->com.state = DEAD;
-	}
-	switch (ep->com.state) {
-	case MPA_REQ_WAIT:
-	case MPA_REQ_SENT:
-	case MPA_REQ_RCVD:
-	case MPA_REP_SENT:
-	case FPDU_MODE:
-		close = 1;
-		if (abrupt)
-			ep->com.state = ABORTING;
-		else {
-			ep->com.state = CLOSING;
-			start_ep_timer(ep);
-		}
-		set_bit(CLOSE_SENT, &ep->com.flags);
-		break;
-	case CLOSING:
-		if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) {
-			close = 1;
-			if (abrupt) {
-				stop_ep_timer(ep);
-				ep->com.state = ABORTING;
-			} else
-				ep->com.state = MORIBUND;
-		}
-		break;
-	case MORIBUND:
-	case ABORTING:
-	case DEAD:
-		pr_debug("%s ignoring disconnect ep %p state %u\n",
-			 __func__, ep, ep->com.state);
-		break;
-	default:
-		BUG();
-		break;
-	}
-
-	spin_unlock_irqrestore(&ep->com.lock, flags);
-	if (close) {
-		if (abrupt)
-			ret = send_abort(ep, NULL, gfp);
-		else
-			ret = send_halfclose(ep, gfp);
-		if (ret)
-			fatal = 1;
-	}
-	if (fatal)
-		release_ep_resources(ep);
-	return ret;
-}
-
-int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new,
-		     struct l2t_entry *l2t)
-{
-	struct iwch_ep *ep = ctx;
-
-	if (ep->dst != old)
-		return 0;
-
-	pr_debug("%s ep %p redirect to dst %p l2t %p\n", __func__, ep, new,
-		 l2t);
-	dst_hold(new);
-	l2t_release(ep->com.tdev, ep->l2t);
-	ep->l2t = l2t;
-	dst_release(old);
-	ep->dst = new;
-	return 1;
-}
-
-/*
- * All the CM events are handled on a work queue to have a safe context.
- * These are the real handlers that are called from the work queue.
- */
-static const cxgb3_cpl_handler_func work_handlers[NUM_CPL_CMDS] = {
-	[CPL_ACT_ESTABLISH]	= act_establish,
-	[CPL_ACT_OPEN_RPL]	= act_open_rpl,
-	[CPL_RX_DATA]		= rx_data,
-	[CPL_TX_DMA_ACK]	= tx_ack,
-	[CPL_ABORT_RPL_RSS]	= abort_rpl,
-	[CPL_ABORT_RPL]		= abort_rpl,
-	[CPL_PASS_OPEN_RPL]	= pass_open_rpl,
-	[CPL_CLOSE_LISTSRV_RPL]	= close_listsrv_rpl,
-	[CPL_PASS_ACCEPT_REQ]	= pass_accept_req,
-	[CPL_PASS_ESTABLISH]	= pass_establish,
-	[CPL_PEER_CLOSE]	= peer_close,
-	[CPL_ABORT_REQ_RSS]	= peer_abort,
-	[CPL_CLOSE_CON_RPL]	= close_con_rpl,
-	[CPL_RDMA_TERMINATE]	= terminate,
-	[CPL_RDMA_EC_STATUS]	= ec_status,
-};
-
-static void process_work(struct work_struct *work)
-{
-	struct sk_buff *skb = NULL;
-	void *ep;
-	struct t3cdev *tdev;
-	int ret;
-
-	while ((skb = skb_dequeue(&rxq))) {
-		ep = *((void **) (skb->cb));
-		tdev = *((struct t3cdev **) (skb->cb + sizeof(void *)));
-		ret = work_handlers[G_OPCODE(ntohl((__force __be32)skb->csum))](tdev, skb, ep);
-		if (ret & CPL_RET_BUF_DONE)
-			kfree_skb(skb);
-
-		/*
-		 * ep was referenced in sched(), and is freed here.
-		 */
-		put_ep((struct iwch_ep_common *)ep);
-	}
-}
-
-static DECLARE_WORK(skb_work, process_work);
-
-static int sched(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct iwch_ep_common *epc = ctx;
-
-	get_ep(epc);
-
-	/*
-	 * Save ctx and tdev in the skb->cb area.
-	 */
-	*((void **) skb->cb) = ctx;
-	*((struct t3cdev **) (skb->cb + sizeof(void *))) = tdev;
-
-	/*
-	 * Queue the skb and schedule the worker thread.
-	 */
-	skb_queue_tail(&rxq, skb);
-	queue_work(workq, &skb_work);
-	return 0;
-}
-
-static int set_tcb_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
-{
-	struct cpl_set_tcb_rpl *rpl = cplhdr(skb);
-
-	if (rpl->status != CPL_ERR_NONE) {
-		pr_err("Unexpected SET_TCB_RPL status %u for tid %u\n",
-		       rpl->status, GET_TID(rpl));
-	}
-	return CPL_RET_BUF_DONE;
-}
-
-/*
- * All upcalls from the T3 Core go to sched() to schedule the
- * processing on a work queue.
- */
-cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS] = {
-	[CPL_ACT_ESTABLISH]	= sched,
-	[CPL_ACT_OPEN_RPL]	= sched,
-	[CPL_RX_DATA]		= sched,
-	[CPL_TX_DMA_ACK]	= sched,
-	[CPL_ABORT_RPL_RSS]	= sched,
-	[CPL_ABORT_RPL]		= sched,
-	[CPL_PASS_OPEN_RPL]	= sched,
-	[CPL_CLOSE_LISTSRV_RPL]	= sched,
-	[CPL_PASS_ACCEPT_REQ]	= sched,
-	[CPL_PASS_ESTABLISH]	= sched,
-	[CPL_PEER_CLOSE]	= sched,
-	[CPL_CLOSE_CON_RPL]	= sched,
-	[CPL_ABORT_REQ_RSS]	= sched,
-	[CPL_RDMA_TERMINATE]	= sched,
-	[CPL_RDMA_EC_STATUS]	= sched,
-	[CPL_SET_TCB_RPL]	= set_tcb_rpl,
-};
-
-int __init iwch_cm_init(void)
-{
-	skb_queue_head_init(&rxq);
-
-	workq = alloc_ordered_workqueue("iw_cxgb3", WQ_MEM_RECLAIM);
-	if (!workq)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void __exit iwch_cm_term(void)
-{
-	flush_workqueue(workq);
-	destroy_workqueue(workq);
-}
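
For reference, the removed iwch_cm.c funnels every CPL message through sched(), which queues the sk_buff to a workqueue, and process_work() then dispatches it via an opcode-indexed handler table (work_handlers[]). The standalone C sketch below mirrors only that table-driven dispatch pattern; the cpl_msg type, the opcode names and the handler bodies are illustrative stand-ins, not driver code.

/*
 * Userspace sketch of the opcode -> handler dispatch used by the removed
 * iwch_cm.c (work_handlers[] / t3c_handlers[]).
 */
#include <stdio.h>
#include <stddef.h>

enum { OP_ESTABLISH, OP_CLOSE, OP_ABORT, NUM_OPS };

struct cpl_msg {			/* stand-in for sk_buff + CPL header */
	int opcode;
	int tid;
};

typedef int (*cpl_handler)(struct cpl_msg *msg);

static int handle_establish(struct cpl_msg *m)
{
	printf("establish tid %d\n", m->tid);
	return 0;
}

static int handle_close(struct cpl_msg *m)
{
	printf("close tid %d\n", m->tid);
	return 0;
}

static int handle_default(struct cpl_msg *m)
{
	printf("unhandled opcode %d (tid %d)\n", m->opcode, m->tid);
	return -1;
}

/* Sparse table indexed by opcode, like work_handlers[NUM_CPL_CMDS]. */
static const cpl_handler handlers[NUM_OPS] = {
	[OP_ESTABLISH]	= handle_establish,
	[OP_CLOSE]	= handle_close,
};

static int dispatch(struct cpl_msg *m)
{
	cpl_handler h = handle_default;

	if (m->opcode >= 0 && m->opcode < NUM_OPS && handlers[m->opcode])
		h = handlers[m->opcode];
	return h(m);
}

int main(void)
{
	struct cpl_msg msgs[] = { { OP_ESTABLISH, 7 }, { OP_ABORT, 7 }, { OP_CLOSE, 7 } };
	size_t i;

	for (i = 0; i < sizeof(msgs) / sizeof(msgs[0]); i++)
		dispatch(&msgs[i]);
	return 0;
}
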
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h
deleted file mode 100644
index cc7fe64..0000000
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.h
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef _IWCH_CM_H_
-#define _IWCH_CM_H_
-
-#include <linux/inet.h>
-#include <linux/wait.h>
-#include <linux/spinlock.h>
-#include <linux/kref.h>
-
-#include <rdma/ib_verbs.h>
-#include <rdma/iw_cm.h>
-
-#include "cxgb3_offload.h"
-#include "iwch_provider.h"
-
-#define MPA_KEY_REQ "MPA ID Req Frame"
-#define MPA_KEY_REP "MPA ID Rep Frame"
-
-#define MPA_MAX_PRIVATE_DATA	256
-#define MPA_REV		0	/* XXX - amso1100 uses rev 0 ! */
-#define MPA_REJECT		0x20
-#define MPA_CRC			0x40
-#define MPA_MARKERS		0x80
-#define MPA_FLAGS_MASK		0xE0
-
-#define put_ep(ep) {							\
-	pr_debug("put_ep (via %s:%u) ep %p refcnt %d\n",		\
-		 __func__, __LINE__, ep, kref_read(&((ep)->kref)));	\
-	WARN_ON(kref_read(&((ep)->kref)) < 1);				\
-	kref_put(&((ep)->kref), __free_ep);				\
-}
-
-#define get_ep(ep) {							\
-	pr_debug("get_ep (via %s:%u) ep %p, refcnt %d\n",		\
-		 __func__, __LINE__, ep, kref_read(&((ep)->kref)));	\
-	kref_get(&((ep)->kref));					\
-}
-
-struct mpa_message {
-	u8 key[16];
-	u8 flags;
-	u8 revision;
-	__be16 private_data_size;
-	u8 private_data[0];
-};
-
-struct terminate_message {
-	u8 layer_etype;
-	u8 ecode;
-	__be16 hdrct_rsvd;
-	u8 len_hdrs[0];
-};
-
-#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28)
-
-enum iwch_layers_types {
-	LAYER_RDMAP		= 0x00,
-	LAYER_DDP		= 0x10,
-	LAYER_MPA		= 0x20,
-	RDMAP_LOCAL_CATA	= 0x00,
-	RDMAP_REMOTE_PROT	= 0x01,
-	RDMAP_REMOTE_OP		= 0x02,
-	DDP_LOCAL_CATA		= 0x00,
-	DDP_TAGGED_ERR		= 0x01,
-	DDP_UNTAGGED_ERR	= 0x02,
-	DDP_LLP			= 0x03
-};
-
-enum iwch_rdma_ecodes {
-	RDMAP_INV_STAG		= 0x00,
-	RDMAP_BASE_BOUNDS	= 0x01,
-	RDMAP_ACC_VIOL		= 0x02,
-	RDMAP_STAG_NOT_ASSOC	= 0x03,
-	RDMAP_TO_WRAP		= 0x04,
-	RDMAP_INV_VERS		= 0x05,
-	RDMAP_INV_OPCODE	= 0x06,
-	RDMAP_STREAM_CATA	= 0x07,
-	RDMAP_GLOBAL_CATA	= 0x08,
-	RDMAP_CANT_INV_STAG	= 0x09,
-	RDMAP_UNSPECIFIED	= 0xff
-};
-
-enum iwch_ddp_ecodes {
-	DDPT_INV_STAG		= 0x00,
-	DDPT_BASE_BOUNDS	= 0x01,
-	DDPT_STAG_NOT_ASSOC	= 0x02,
-	DDPT_TO_WRAP		= 0x03,
-	DDPT_INV_VERS		= 0x04,
-	DDPU_INV_QN		= 0x01,
-	DDPU_INV_MSN_NOBUF	= 0x02,
-	DDPU_INV_MSN_RANGE	= 0x03,
-	DDPU_INV_MO		= 0x04,
-	DDPU_MSG_TOOBIG		= 0x05,
-	DDPU_INV_VERS		= 0x06
-};
-
-enum iwch_mpa_ecodes {
-	MPA_CRC_ERR		= 0x02,
-	MPA_MARKER_ERR		= 0x03
-};
-
-enum iwch_ep_state {
-	IDLE = 0,
-	LISTEN,
-	CONNECTING,
-	MPA_REQ_WAIT,
-	MPA_REQ_SENT,
-	MPA_REQ_RCVD,
-	MPA_REP_SENT,
-	FPDU_MODE,
-	ABORTING,
-	CLOSING,
-	MORIBUND,
-	DEAD,
-};
-
-enum iwch_ep_flags {
-	PEER_ABORT_IN_PROGRESS	= 0,
-	ABORT_REQ_IN_PROGRESS	= 1,
-	RELEASE_RESOURCES	= 2,
-	CLOSE_SENT		= 3,
-};
-
-struct iwch_ep_common {
-	struct iw_cm_id *cm_id;
-	struct iwch_qp *qp;
-	struct t3cdev *tdev;
-	enum iwch_ep_state state;
-	struct kref kref;
-	spinlock_t lock;
-	struct sockaddr_in local_addr;
-	struct sockaddr_in remote_addr;
-	wait_queue_head_t waitq;
-	int rpl_done;
-	int rpl_err;
-	unsigned long flags;
-};
-
-struct iwch_listen_ep {
-	struct iwch_ep_common com;
-	unsigned int stid;
-	int backlog;
-};
-
-struct iwch_ep {
-	struct iwch_ep_common com;
-	struct iwch_ep *parent_ep;
-	struct timer_list timer;
-	unsigned int atid;
-	u32 hwtid;
-	u32 snd_seq;
-	u32 rcv_seq;
-	struct l2t_entry *l2t;
-	struct dst_entry *dst;
-	struct sk_buff *mpa_skb;
-	struct iwch_mpa_attributes mpa_attr;
-	unsigned int mpa_pkt_len;
-	u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA];
-	u8 tos;
-	u16 emss;
-	u16 plen;
-	u32 ird;
-	u32 ord;
-};
-
-static inline struct iwch_ep *to_ep(struct iw_cm_id *cm_id)
-{
-	return cm_id->provider_data;
-}
-
-static inline struct iwch_listen_ep *to_listen_ep(struct iw_cm_id *cm_id)
-{
-	return cm_id->provider_data;
-}
-
-static inline int compute_wscale(int win)
-{
-	int wscale = 0;
-
-	while (wscale < 14 && (65535<<wscale) < win)
-		wscale++;
-	return wscale;
-}
-
-/* CM prototypes */
-
-int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
-int iwch_create_listen(struct iw_cm_id *cm_id, int backlog);
-int iwch_destroy_listen(struct iw_cm_id *cm_id);
-int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len);
-int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
-int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp);
-int iwch_quiesce_tid(struct iwch_ep *ep);
-int iwch_resume_tid(struct iwch_ep *ep);
-void __free_ep(struct kref *kref);
-void iwch_rearp(struct iwch_ep *ep);
-int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, struct l2t_entry *l2t);
-
-int __init iwch_cm_init(void);
-void __exit iwch_cm_term(void);
-extern int peer2peer;
-
-#endif				/* _IWCH_CM_H_ */
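
The compute_wscale() inline removed above is self-contained enough to check on its own: it picks the smallest TCP window-scale factor (capped at 14) such that 65535 << wscale covers the requested receive window. A minimal userspace sketch, with the window sizes chosen purely for illustration:

/* Standalone check of the window-scale calculation from the removed iwch_cm.h. */
#include <stdio.h>

static int compute_wscale(int win)
{
	int wscale = 0;

	while (wscale < 14 && (65535 << wscale) < win)
		wscale++;
	return wscale;
}

int main(void)
{
	static const int wins[] = { 65535, 256 * 1024, 1024 * 1024, 16 * 1024 * 1024 };
	unsigned int i;

	for (i = 0; i < sizeof(wins) / sizeof(wins[0]); i++)
		printf("win %8d -> wscale %d\n", wins[i], compute_wscale(wins[i]));
	return 0;
}
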
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cq.c b/drivers/infiniband/hw/cxgb3/iwch_cq.c
deleted file mode 100644
index a098c01..0000000
--- a/drivers/infiniband/hw/cxgb3/iwch_cq.c
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "iwch_provider.h"
-#include "iwch.h"
-
-static int __iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp,
-			      struct iwch_qp *qhp, struct ib_wc *wc)
-{
-	struct t3_wq *wq = qhp ? &qhp->wq : NULL;
-	struct t3_cqe cqe;
-	u32 credit = 0;
-	u8 cqe_flushed;
-	u64 cookie;
-	int ret = 1;
-
-	ret = cxio_poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie,
-				   &credit);
-	if (t3a_device(chp->rhp) && credit) {
-		pr_debug("%s updating %d cq credits on id %d\n", __func__,
-			 credit, chp->cq.cqid);
-		cxio_hal_cq_op(&rhp->rdev, &chp->cq, CQ_CREDIT_UPDATE, credit);
-	}
-
-	if (ret) {
-		ret = -EAGAIN;
-		goto out;
-	}
-	ret = 1;
-
-	wc->wr_id = cookie;
-	wc->qp = qhp ? &qhp->ibqp : NULL;
-	wc->vendor_err = CQE_STATUS(cqe);
-	wc->wc_flags = 0;
-
-	pr_debug("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x lo 0x%x cookie 0x%llx\n",
-		 __func__,
-		 CQE_QPID(cqe), CQE_TYPE(cqe),
-		 CQE_OPCODE(cqe), CQE_STATUS(cqe), CQE_WRID_HI(cqe),
-		 CQE_WRID_LOW(cqe), (unsigned long long)cookie);
-
-	if (CQE_TYPE(cqe) == 0) {
-		if (!CQE_STATUS(cqe))
-			wc->byte_len = CQE_LEN(cqe);
-		else
-			wc->byte_len = 0;
-		wc->opcode = IB_WC_RECV;
-		if (CQE_OPCODE(cqe) == T3_SEND_WITH_INV ||
-		    CQE_OPCODE(cqe) == T3_SEND_WITH_SE_INV) {
-			wc->ex.invalidate_rkey = CQE_WRID_STAG(cqe);
-			wc->wc_flags |= IB_WC_WITH_INVALIDATE;
-		}
-	} else {
-		switch (CQE_OPCODE(cqe)) {
-		case T3_RDMA_WRITE:
-			wc->opcode = IB_WC_RDMA_WRITE;
-			break;
-		case T3_READ_REQ:
-			wc->opcode = IB_WC_RDMA_READ;
-			wc->byte_len = CQE_LEN(cqe);
-			break;
-		case T3_SEND:
-		case T3_SEND_WITH_SE:
-		case T3_SEND_WITH_INV:
-		case T3_SEND_WITH_SE_INV:
-			wc->opcode = IB_WC_SEND;
-			break;
-		case T3_LOCAL_INV:
-			wc->opcode = IB_WC_LOCAL_INV;
-			break;
-		case T3_FAST_REGISTER:
-			wc->opcode = IB_WC_REG_MR;
-			break;
-		default:
-			pr_err("Unexpected opcode %d in the CQE received for QPID=0x%0x\n",
-			       CQE_OPCODE(cqe), CQE_QPID(cqe));
-			ret = -EINVAL;
-			goto out;
-		}
-	}
-
-	if (cqe_flushed)
-		wc->status = IB_WC_WR_FLUSH_ERR;
-	else {
-
-		switch (CQE_STATUS(cqe)) {
-		case TPT_ERR_SUCCESS:
-			wc->status = IB_WC_SUCCESS;
-			break;
-		case TPT_ERR_STAG:
-			wc->status = IB_WC_LOC_ACCESS_ERR;
-			break;
-		case TPT_ERR_PDID:
-			wc->status = IB_WC_LOC_PROT_ERR;
-			break;
-		case TPT_ERR_QPID:
-		case TPT_ERR_ACCESS:
-			wc->status = IB_WC_LOC_ACCESS_ERR;
-			break;
-		case TPT_ERR_WRAP:
-			wc->status = IB_WC_GENERAL_ERR;
-			break;
-		case TPT_ERR_BOUND:
-			wc->status = IB_WC_LOC_LEN_ERR;
-			break;
-		case TPT_ERR_INVALIDATE_SHARED_MR:
-		case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND:
-			wc->status = IB_WC_MW_BIND_ERR;
-			break;
-		case TPT_ERR_CRC:
-		case TPT_ERR_MARKER:
-		case TPT_ERR_PDU_LEN_ERR:
-		case TPT_ERR_OUT_OF_RQE:
-		case TPT_ERR_DDP_VERSION:
-		case TPT_ERR_RDMA_VERSION:
-		case TPT_ERR_DDP_QUEUE_NUM:
-		case TPT_ERR_MSN:
-		case TPT_ERR_TBIT:
-		case TPT_ERR_MO:
-		case TPT_ERR_MSN_RANGE:
-		case TPT_ERR_IRD_OVERFLOW:
-		case TPT_ERR_OPCODE:
-			wc->status = IB_WC_FATAL_ERR;
-			break;
-		case TPT_ERR_SWFLUSH:
-			wc->status = IB_WC_WR_FLUSH_ERR;
-			break;
-		default:
-			pr_err("Unexpected cqe_status 0x%x for QPID=0x%0x\n",
-			       CQE_STATUS(cqe), CQE_QPID(cqe));
-			ret = -EINVAL;
-		}
-	}
-out:
-	return ret;
-}
-
-/*
- * Get one cq entry from cxio and map it to openib.
- *
- * Returns:
- *	0			EMPTY;
- *	1			cqe returned
- *	-EAGAIN		caller must try again
- *	any other -errno	fatal error
- */
-static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp,
-			    struct ib_wc *wc)
-{
-	struct iwch_qp *qhp;
-	struct t3_cqe *rd_cqe;
-	int ret;
-
-	rd_cqe = cxio_next_cqe(&chp->cq);
-
-	if (!rd_cqe)
-		return 0;
-
-	qhp = get_qhp(rhp, CQE_QPID(*rd_cqe));
-	if (qhp) {
-		spin_lock(&qhp->lock);
-		ret = __iwch_poll_cq_one(rhp, chp, qhp, wc);
-		spin_unlock(&qhp->lock);
-	} else {
-		ret = __iwch_poll_cq_one(rhp, chp, NULL, wc);
-	}
-	return ret;
-}
-
-int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
-{
-	struct iwch_dev *rhp;
-	struct iwch_cq *chp;
-	unsigned long flags;
-	int npolled;
-	int err = 0;
-
-	chp = to_iwch_cq(ibcq);
-	rhp = chp->rhp;
-
-	spin_lock_irqsave(&chp->lock, flags);
-	for (npolled = 0; npolled < num_entries; ++npolled) {
-
-		/*
-		 * Because T3 can post CQEs that are _not_ associated
-		 * with a WR, we might have to poll again after removing
-		 * one of these.
-		 */
-		do {
-			err = iwch_poll_cq_one(rhp, chp, wc + npolled);
-		} while (err == -EAGAIN);
-		if (err <= 0)
-			break;
-	}
-	spin_unlock_irqrestore(&chp->lock, flags);
-
-	if (err < 0)
-		return err;
-	else {
-		return npolled;
-	}
-}
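
As the comment in the removed iwch_poll_cq() notes, T3 can post CQEs that are not associated with a work request, so the per-entry poll helper uses a three-way contract: 1 for a completion, 0 for an empty queue, and -EAGAIN for an entry the caller should simply skip by polling again. The toy model below, with a plain array standing in for the hardware CQ, shows only that contract; none of it is driver code.

/* Toy model of the poll contract from the removed iwch_cq.c: 1 = completion
 * produced, 0 = queue empty, -EAGAIN = entry consumed but no WR to report. */
#include <stdio.h>
#include <errno.h>

static const int fake_cq[] = { 11, -1, 12, -1, -1, 13 };	/* -1: CQE without a WR */
static unsigned int rptr;

static int poll_one(int *wr_id)
{
	if (rptr >= sizeof(fake_cq) / sizeof(fake_cq[0]))
		return 0;			/* empty */
	if (fake_cq[rptr] < 0) {
		rptr++;
		return -EAGAIN;			/* nothing to report, poll again */
	}
	*wr_id = fake_cq[rptr++];
	return 1;				/* one completion */
}

int main(void)
{
	int wr_id, ret, npolled;

	for (npolled = 0; npolled < 8; npolled++) {
		do {
			ret = poll_one(&wr_id);
		} while (ret == -EAGAIN);
		if (ret <= 0)
			break;
		printf("completion %d: wr_id %d\n", npolled, wr_id);
	}
	printf("polled %d completions\n", npolled);
	return 0;
}
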
diff --git a/drivers/infiniband/hw/cxgb3/iwch_ev.c b/drivers/infiniband/hw/cxgb3/iwch_ev.c
deleted file mode 100644
index 9d356c1..0000000
--- a/drivers/infiniband/hw/cxgb3/iwch_ev.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/gfp.h>
-#include <linux/mman.h>
-#include <net/sock.h>
-#include "iwch_provider.h"
-#include "iwch.h"
-#include "iwch_cm.h"
-#include "cxio_hal.h"
-#include "cxio_wr.h"
-
-static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp,
-			  struct respQ_msg_t *rsp_msg,
-			  enum ib_event_type ib_event,
-			  int send_term)
-{
-	struct ib_event event;
-	struct iwch_qp_attributes attrs;
-	struct iwch_qp *qhp;
-	unsigned long flag;
-
-	xa_lock(&rnicp->qps);
-	qhp = xa_load(&rnicp->qps, CQE_QPID(rsp_msg->cqe));
-
-	if (!qhp) {
-		pr_err("%s unaffiliated error 0x%x qpid 0x%x\n",
-		       __func__, CQE_STATUS(rsp_msg->cqe),
-		       CQE_QPID(rsp_msg->cqe));
-		xa_unlock(&rnicp->qps);
-		return;
-	}
-
-	if ((qhp->attr.state == IWCH_QP_STATE_ERROR) ||
-	    (qhp->attr.state == IWCH_QP_STATE_TERMINATE)) {
-		pr_debug("%s AE received after RTS - qp state %d qpid 0x%x status 0x%x\n",
-			 __func__,
-			 qhp->attr.state, qhp->wq.qpid,
-			 CQE_STATUS(rsp_msg->cqe));
-		xa_unlock(&rnicp->qps);
-		return;
-	}
-
-	pr_err("%s - AE qpid 0x%x opcode %d status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n",
-	       __func__,
-	       CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe),
-	       CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe),
-	       CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
-
-	atomic_inc(&qhp->refcnt);
-	xa_unlock(&rnicp->qps);
-
-	if (qhp->attr.state == IWCH_QP_STATE_RTS) {
-		attrs.next_state = IWCH_QP_STATE_TERMINATE;
-		iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE,
-			       &attrs, 1);
-		if (send_term)
-			iwch_post_terminate(qhp, rsp_msg);
-	}
-
-	event.event = ib_event;
-	event.device = chp->ibcq.device;
-	if (ib_event == IB_EVENT_CQ_ERR)
-		event.element.cq = &chp->ibcq;
-	else
-		event.element.qp = &qhp->ibqp;
-
-	if (qhp->ibqp.event_handler)
-		(*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context);
-
-	spin_lock_irqsave(&chp->comp_handler_lock, flag);
-	(*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
-	spin_unlock_irqrestore(&chp->comp_handler_lock, flag);
-
-	if (atomic_dec_and_test(&qhp->refcnt))
-		wake_up(&qhp->wait);
-}
-
-void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb)
-{
-	struct iwch_dev *rnicp;
-	struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data;
-	struct iwch_cq *chp;
-	struct iwch_qp *qhp;
-	u32 cqid = RSPQ_CQID(rsp_msg);
-	unsigned long flag;
-
-	rnicp = (struct iwch_dev *) rdev_p->ulp;
-	xa_lock(&rnicp->qps);
-	chp = get_chp(rnicp, cqid);
-	qhp = xa_load(&rnicp->qps, CQE_QPID(rsp_msg->cqe));
-	if (!chp || !qhp) {
-		pr_err("BAD AE cqid 0x%x qpid 0x%x opcode %d status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n",
-		       cqid, CQE_QPID(rsp_msg->cqe),
-		       CQE_OPCODE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe),
-		       CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe),
-		       CQE_WRID_LOW(rsp_msg->cqe));
-		xa_unlock(&rnicp->qps);
-		goto out;
-	}
-	iwch_qp_add_ref(&qhp->ibqp);
-	atomic_inc(&chp->refcnt);
-	xa_unlock(&rnicp->qps);
-
-	/*
-	 * 1) completion of our sending a TERMINATE.
-	 * 2) incoming TERMINATE message.
-	 */
-	if ((CQE_OPCODE(rsp_msg->cqe) == T3_TERMINATE) &&
-	    (CQE_STATUS(rsp_msg->cqe) == 0)) {
-		if (SQ_TYPE(rsp_msg->cqe)) {
-			pr_debug("%s QPID 0x%x ep %p disconnecting\n",
-				 __func__, qhp->wq.qpid, qhp->ep);
-			iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC);
-		} else {
-			pr_debug("%s post REQ_ERR AE QPID 0x%x\n", __func__,
-				 qhp->wq.qpid);
-			post_qp_event(rnicp, chp, rsp_msg,
-				      IB_EVENT_QP_REQ_ERR, 0);
-			iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC);
-		}
-		goto done;
-	}
-
-	/* Bad incoming Read request */
-	if (SQ_TYPE(rsp_msg->cqe) &&
-	    (CQE_OPCODE(rsp_msg->cqe) == T3_READ_RESP)) {
-		post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1);
-		goto done;
-	}
-
-	/* Bad incoming write */
-	if (RQ_TYPE(rsp_msg->cqe) &&
-	    (CQE_OPCODE(rsp_msg->cqe) == T3_RDMA_WRITE)) {
-		post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1);
-		goto done;
-	}
-
-	switch (CQE_STATUS(rsp_msg->cqe)) {
-
-	/* Completion Events */
-	case TPT_ERR_SUCCESS:
-
-		/*
-		 * Confirm the destination entry if this is a RECV completion.
-		 */
-		if (qhp->ep && SQ_TYPE(rsp_msg->cqe))
-			dst_confirm(qhp->ep->dst);
-		spin_lock_irqsave(&chp->comp_handler_lock, flag);
-		(*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
-		spin_unlock_irqrestore(&chp->comp_handler_lock, flag);
-		break;
-
-	case TPT_ERR_STAG:
-	case TPT_ERR_PDID:
-	case TPT_ERR_QPID:
-	case TPT_ERR_ACCESS:
-	case TPT_ERR_WRAP:
-	case TPT_ERR_BOUND:
-	case TPT_ERR_INVALIDATE_SHARED_MR:
-	case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND:
-		post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1);
-		break;
-
-	/* Device Fatal Errors */
-	case TPT_ERR_ECC:
-	case TPT_ERR_ECC_PSTAG:
-	case TPT_ERR_INTERNAL_ERR:
-		post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_DEVICE_FATAL, 1);
-		break;
-
-	/* QP Fatal Errors */
-	case TPT_ERR_OUT_OF_RQE:
-	case TPT_ERR_PBL_ADDR_BOUND:
-	case TPT_ERR_CRC:
-	case TPT_ERR_MARKER:
-	case TPT_ERR_PDU_LEN_ERR:
-	case TPT_ERR_DDP_VERSION:
-	case TPT_ERR_RDMA_VERSION:
-	case TPT_ERR_OPCODE:
-	case TPT_ERR_DDP_QUEUE_NUM:
-	case TPT_ERR_MSN:
-	case TPT_ERR_TBIT:
-	case TPT_ERR_MO:
-	case TPT_ERR_MSN_GAP:
-	case TPT_ERR_MSN_RANGE:
-	case TPT_ERR_RQE_ADDR_BOUND:
-	case TPT_ERR_IRD_OVERFLOW:
-		post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1);
-		break;
-
-	default:
-		pr_err("Unknown T3 status 0x%x QPID 0x%x\n",
-		       CQE_STATUS(rsp_msg->cqe), qhp->wq.qpid);
-		post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1);
-		break;
-	}
-done:
-	if (atomic_dec_and_test(&chp->refcnt))
-	        wake_up(&chp->wait);
-	iwch_qp_rem_ref(&qhp->ibqp);
-out:
-	dev_kfree_skb_irq(skb);
-}
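
The removed iwch_ev_dispatch() boils down to classifying each asynchronous error status into one of a few IB event classes (plain completion, QP access error, device fatal, QP fatal) before notifying the consumer. The sketch below reproduces only that classification shape; the numeric status codes in it are invented for illustration and do not correspond to the real TPT_ERR_* values.

/* Illustrative mapping of error-status codes to coarse event classes, in the
 * spirit of the switch in the removed iwch_ev.c. Status values are made up. */
#include <stdio.h>

enum ev_class { EV_COMPLETION, EV_QP_ACCESS_ERR, EV_DEVICE_FATAL, EV_QP_FATAL };

static enum ev_class classify(unsigned int status)
{
	switch (status) {
	case 0:				/* success: just a completion */
		return EV_COMPLETION;
	case 1: case 2: case 3:		/* protection/access problems */
		return EV_QP_ACCESS_ERR;
	case 16: case 17:		/* ECC/internal errors */
		return EV_DEVICE_FATAL;
	default:			/* everything else is fatal to the QP */
		return EV_QP_FATAL;
	}
}

int main(void)
{
	static const char * const names[] = {
		"completion", "qp access error", "device fatal", "qp fatal"
	};
	static const unsigned int samples[] = { 0, 2, 16, 42 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("status %2u -> %s\n", samples[i], names[classify(samples[i])]);
	return 0;
}
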
diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c
deleted file mode 100644
index ce0f274..0000000
--- a/drivers/infiniband/hw/cxgb3/iwch_mem.c
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/slab.h>
-#include <asm/byteorder.h>
-
-#include <rdma/iw_cm.h>
-#include <rdma/ib_verbs.h>
-
-#include "cxio_hal.h"
-#include "cxio_resource.h"
-#include "iwch.h"
-#include "iwch_provider.h"
-
-static int iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag)
-{
-	u32 mmid;
-
-	mhp->attr.state = 1;
-	mhp->attr.stag = stag;
-	mmid = stag >> 8;
-	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
-	pr_debug("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp);
-	return xa_insert_irq(&mhp->rhp->mrs, mmid, mhp, GFP_KERNEL);
-}
-
-int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
-		      struct iwch_mr *mhp, int shift)
-{
-	u32 stag;
-	int ret;
-
-	if (cxio_register_phys_mem(&rhp->rdev,
-				   &stag, mhp->attr.pdid,
-				   mhp->attr.perms,
-				   mhp->attr.zbva,
-				   mhp->attr.va_fbo,
-				   mhp->attr.len,
-				   shift - 12,
-				   mhp->attr.pbl_size, mhp->attr.pbl_addr))
-		return -ENOMEM;
-
-	ret = iwch_finish_mem_reg(mhp, stag);
-	if (ret)
-		cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
-		       mhp->attr.pbl_addr);
-	return ret;
-}
-
-int iwch_alloc_pbl(struct iwch_mr *mhp, int npages)
-{
-	mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev,
-						    npages << 3);
-
-	if (!mhp->attr.pbl_addr)
-		return -ENOMEM;
-
-	mhp->attr.pbl_size = npages;
-
-	return 0;
-}
-
-void iwch_free_pbl(struct iwch_mr *mhp)
-{
-	cxio_hal_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
-			      mhp->attr.pbl_size << 3);
-}
-
-int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset)
-{
-	return cxio_write_pbl(&mhp->rhp->rdev, pages,
-			      mhp->attr.pbl_addr + (offset << 3), npages);
-}
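
The PBL helpers removed above size the page-buffer list as npages << 3 bytes, i.e. one 64-bit big-endian entry per page. The userspace sketch below lays out such a list for a handful of synthetic addresses; the base address and page count are made up, and only the 8-bytes-per-entry, big-endian layout follows the driver.

/* Sketch of the page-buffer-list (PBL) layout implied by the removed
 * iwch_mem.c: 64-bit big-endian entries, npages << 3 bytes total. */
#include <stdio.h>
#include <stdint.h>

static void put_be64(uint8_t *dst, uint64_t v)
{
	int i;

	for (i = 7; i >= 0; i--) {
		dst[i] = (uint8_t)(v & 0xff);
		v >>= 8;
	}
}

int main(void)
{
	enum { NPAGES = 4, PG_SHIFT = 12 };
	uint8_t pbl[NPAGES << 3];		/* npages << 3 bytes, as in the driver */
	uint64_t base = 0x100000;		/* made-up DMA base address */
	int i, b;

	for (i = 0; i < NPAGES; i++)
		put_be64(pbl + ((size_t)i << 3), base + ((uint64_t)i << PG_SHIFT));

	printf("PBL is %zu bytes (%d entries of 8 bytes)\n", sizeof(pbl), NPAGES);
	for (i = 0; i < NPAGES; i++) {
		printf("entry %d:", i);
		for (b = 0; b < 8; b++)
			printf(" %02x", pbl[(i << 3) + b]);
		printf("\n");
	}
	return 0;
}
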
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
deleted file mode 100644
index dcf02ec..0000000
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ /dev/null
@@ -1,1321 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/device.h>
-#include <linux/netdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/delay.h>
-#include <linux/errno.h>
-#include <linux/list.h>
-#include <linux/sched/mm.h>
-#include <linux/spinlock.h>
-#include <linux/ethtool.h>
-#include <linux/rtnetlink.h>
-#include <linux/inetdevice.h>
-#include <linux/slab.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/byteorder.h>
-
-#include <rdma/iw_cm.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/ib_smi.h>
-#include <rdma/ib_umem.h>
-#include <rdma/ib_user_verbs.h>
-#include <rdma/uverbs_ioctl.h>
-
-#include "cxio_hal.h"
-#include "iwch.h"
-#include "iwch_provider.h"
-#include "iwch_cm.h"
-#include <rdma/cxgb3-abi.h>
-#include "common.h"
-
-static void iwch_dealloc_ucontext(struct ib_ucontext *context)
-{
-	struct iwch_dev *rhp = to_iwch_dev(context->device);
-	struct iwch_ucontext *ucontext = to_iwch_ucontext(context);
-	struct iwch_mm_entry *mm, *tmp;
-
-	pr_debug("%s context %p\n", __func__, context);
-	list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
-		kfree(mm);
-	cxio_release_ucontext(&rhp->rdev, &ucontext->uctx);
-}
-
-static int iwch_alloc_ucontext(struct ib_ucontext *ucontext,
-			       struct ib_udata *udata)
-{
-	struct ib_device *ibdev = ucontext->device;
-	struct iwch_ucontext *context = to_iwch_ucontext(ucontext);
-	struct iwch_dev *rhp = to_iwch_dev(ibdev);
-
-	pr_debug("%s ibdev %p\n", __func__, ibdev);
-	cxio_init_ucontext(&rhp->rdev, &context->uctx);
-	INIT_LIST_HEAD(&context->mmaps);
-	spin_lock_init(&context->mmap_lock);
-	return 0;
-}
-
-static void iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
-{
-	struct iwch_cq *chp;
-
-	pr_debug("%s ib_cq %p\n", __func__, ib_cq);
-	chp = to_iwch_cq(ib_cq);
-
-	xa_erase_irq(&chp->rhp->cqs, chp->cq.cqid);
-	atomic_dec(&chp->refcnt);
-	wait_event(chp->wait, !atomic_read(&chp->refcnt));
-
-	cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
-}
-
-static int iwch_create_cq(struct ib_cq *ibcq,
-			  const struct ib_cq_init_attr *attr,
-			  struct ib_udata *udata)
-{
-	struct ib_device *ibdev = ibcq->device;
-	int entries = attr->cqe;
-	struct iwch_dev *rhp = to_iwch_dev(ibcq->device);
-	struct iwch_cq *chp = to_iwch_cq(ibcq);
-	struct iwch_create_cq_resp uresp;
-	struct iwch_create_cq_req ureq;
-	static int warned;
-	size_t resplen;
-
-	pr_debug("%s ib_dev %p entries %d\n", __func__, ibdev, entries);
-	if (attr->flags)
-		return -EINVAL;
-
-	if (udata) {
-		if (!t3a_device(rhp)) {
-			if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
-				return  -EFAULT;
-
-			chp->user_rptr_addr = (u32 __user *)(unsigned long)ureq.user_rptr_addr;
-		}
-	}
-
-	if (t3a_device(rhp)) {
-
-		/*
-		 * T3A: Add some fluff to handle extra CQEs inserted
-		 * for various errors.
-		 * Additional CQE possibilities:
-		 *      TERMINATE,
-		 *      incoming RDMA WRITE Failures
-		 *      incoming RDMA READ REQUEST FAILUREs
-		 * NOTE: We cannot ensure the CQ won't overflow.
-		 */
-		entries += 16;
-	}
-	entries = roundup_pow_of_two(entries);
-	chp->cq.size_log2 = ilog2(entries);
-
-	if (cxio_create_cq(&rhp->rdev, &chp->cq, !udata))
-		return -ENOMEM;
-
-	chp->rhp = rhp;
-	chp->ibcq.cqe = 1 << chp->cq.size_log2;
-	spin_lock_init(&chp->lock);
-	spin_lock_init(&chp->comp_handler_lock);
-	atomic_set(&chp->refcnt, 1);
-	init_waitqueue_head(&chp->wait);
-	if (xa_store_irq(&rhp->cqs, chp->cq.cqid, chp, GFP_KERNEL)) {
-		cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
-		return -ENOMEM;
-	}
-
-	if (udata) {
-		struct iwch_mm_entry *mm;
-		struct iwch_ucontext *ucontext = rdma_udata_to_drv_context(
-			udata, struct iwch_ucontext, ibucontext);
-
-		mm = kmalloc(sizeof(*mm), GFP_KERNEL);
-		if (!mm) {
-			iwch_destroy_cq(&chp->ibcq, udata);
-			return -ENOMEM;
-		}
-		uresp.cqid = chp->cq.cqid;
-		uresp.size_log2 = chp->cq.size_log2;
-		spin_lock(&ucontext->mmap_lock);
-		uresp.key = ucontext->key;
-		ucontext->key += PAGE_SIZE;
-		spin_unlock(&ucontext->mmap_lock);
-		mm->key = uresp.key;
-		mm->addr = virt_to_phys(chp->cq.queue);
-		if (udata->outlen < sizeof(uresp)) {
-			if (!warned++)
-				pr_warn("Warning - downlevel libcxgb3 (non-fatal)\n");
-			mm->len = PAGE_ALIGN((1UL << uresp.size_log2) *
-					     sizeof(struct t3_cqe));
-			resplen = sizeof(struct iwch_create_cq_resp_v0);
-		} else {
-			mm->len = PAGE_ALIGN(((1UL << uresp.size_log2) + 1) *
-					     sizeof(struct t3_cqe));
-			uresp.memsize = mm->len;
-			uresp.reserved = 0;
-			resplen = sizeof(uresp);
-		}
-		if (ib_copy_to_udata(udata, &uresp, resplen)) {
-			kfree(mm);
-			iwch_destroy_cq(&chp->ibcq, udata);
-			return -EFAULT;
-		}
-		insert_mmap(ucontext, mm);
-	}
-	pr_debug("created cqid 0x%0x chp %p size 0x%0x, dma_addr %pad\n",
-		 chp->cq.cqid, chp, (1 << chp->cq.size_log2),
-		 &chp->cq.dma_addr);
-	return 0;
-}
-
-static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
-{
-	struct iwch_dev *rhp;
-	struct iwch_cq *chp;
-	enum t3_cq_opcode cq_op;
-	int err;
-	unsigned long flag;
-	u32 rptr;
-
-	chp = to_iwch_cq(ibcq);
-	rhp = chp->rhp;
-	if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
-		cq_op = CQ_ARM_SE;
-	else
-		cq_op = CQ_ARM_AN;
-	if (chp->user_rptr_addr) {
-		if (get_user(rptr, chp->user_rptr_addr))
-			return -EFAULT;
-		spin_lock_irqsave(&chp->lock, flag);
-		chp->cq.rptr = rptr;
-	} else
-		spin_lock_irqsave(&chp->lock, flag);
-	pr_debug("%s rptr 0x%x\n", __func__, chp->cq.rptr);
-	err = cxio_hal_cq_op(&rhp->rdev, &chp->cq, cq_op, 0);
-	spin_unlock_irqrestore(&chp->lock, flag);
-	if (err < 0)
-		pr_err("Error %d rearming CQID 0x%x\n", err, chp->cq.cqid);
-	if (err > 0 && !(flags & IB_CQ_REPORT_MISSED_EVENTS))
-		err = 0;
-	return err;
-}
-
-static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
-{
-	int len = vma->vm_end - vma->vm_start;
-	u32 key = vma->vm_pgoff << PAGE_SHIFT;
-	struct cxio_rdev *rdev_p;
-	int ret = 0;
-	struct iwch_mm_entry *mm;
-	struct iwch_ucontext *ucontext;
-	u64 addr;
-
-	pr_debug("%s pgoff 0x%lx key 0x%x len %d\n", __func__, vma->vm_pgoff,
-		 key, len);
-
-	if (vma->vm_start & (PAGE_SIZE-1)) {
-	        return -EINVAL;
-	}
-
-	rdev_p = &(to_iwch_dev(context->device)->rdev);
-	ucontext = to_iwch_ucontext(context);
-
-	mm = remove_mmap(ucontext, key, len);
-	if (!mm)
-		return -EINVAL;
-	addr = mm->addr;
-	kfree(mm);
-
-	if ((addr >= rdev_p->rnic_info.udbell_physbase) &&
-	    (addr < (rdev_p->rnic_info.udbell_physbase +
-		       rdev_p->rnic_info.udbell_len))) {
-
-		/*
-		 * Map T3 DB register.
-		 */
-		if (vma->vm_flags & VM_READ) {
-			return -EPERM;
-		}
-
-		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-		vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
-		vma->vm_flags &= ~VM_MAYREAD;
-		ret = io_remap_pfn_range(vma, vma->vm_start,
-					 addr >> PAGE_SHIFT,
-				         len, vma->vm_page_prot);
-	} else {
-
-		/*
-		 * Map WQ or CQ contig dma memory...
-		 */
-		ret = remap_pfn_range(vma, vma->vm_start,
-				      addr >> PAGE_SHIFT,
-				      len, vma->vm_page_prot);
-	}
-
-	return ret;
-}
-
-static void iwch_deallocate_pd(struct ib_pd *pd, struct ib_udata *udata)
-{
-	struct iwch_dev *rhp;
-	struct iwch_pd *php;
-
-	php = to_iwch_pd(pd);
-	rhp = php->rhp;
-	pr_debug("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid);
-	cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid);
-}
-
-static int iwch_allocate_pd(struct ib_pd *pd, struct ib_udata *udata)
-{
-	struct iwch_pd *php = to_iwch_pd(pd);
-	struct ib_device *ibdev = pd->device;
-	u32 pdid;
-	struct iwch_dev *rhp;
-
-	pr_debug("%s ibdev %p\n", __func__, ibdev);
-	rhp = (struct iwch_dev *) ibdev;
-	pdid = cxio_hal_get_pdid(rhp->rdev.rscp);
-	if (!pdid)
-		return -EINVAL;
-
-	php->pdid = pdid;
-	php->rhp = rhp;
-	if (udata) {
-		struct iwch_alloc_pd_resp resp = {.pdid = php->pdid};
-
-		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
-			iwch_deallocate_pd(&php->ibpd, udata);
-			return -EFAULT;
-		}
-	}
-	pr_debug("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php);
-	return 0;
-}
-
-static int iwch_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
-{
-	struct iwch_dev *rhp;
-	struct iwch_mr *mhp;
-	u32 mmid;
-
-	pr_debug("%s ib_mr %p\n", __func__, ib_mr);
-
-	mhp = to_iwch_mr(ib_mr);
-	kfree(mhp->pages);
-	rhp = mhp->rhp;
-	mmid = mhp->attr.stag >> 8;
-	cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
-		       mhp->attr.pbl_addr);
-	iwch_free_pbl(mhp);
-	xa_erase_irq(&rhp->mrs, mmid);
-	if (mhp->kva)
-		kfree((void *) (unsigned long) mhp->kva);
-	ib_umem_release(mhp->umem);
-	pr_debug("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp);
-	kfree(mhp);
-	return 0;
-}
-
-static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
-{
-	const u64 total_size = 0xffffffff;
-	const u64 mask = (total_size + PAGE_SIZE - 1) & PAGE_MASK;
-	struct iwch_pd *php = to_iwch_pd(pd);
-	struct iwch_dev *rhp = php->rhp;
-	struct iwch_mr *mhp;
-	__be64 *page_list;
-	int shift = 26, npages, ret, i;
-
-	pr_debug("%s ib_pd %p\n", __func__, pd);
-
-	/*
-	 * T3 only supports 32 bits of size.
-	 */
-	if (sizeof(phys_addr_t) > 4) {
-		pr_warn_once("Cannot support dma_mrs on this platform\n");
-		return ERR_PTR(-ENOTSUPP);
-	}
-
-	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
-	if (!mhp)
-		return ERR_PTR(-ENOMEM);
-
-	mhp->rhp = rhp;
-
-	npages = (total_size + (1ULL << shift) - 1) >> shift;
-	if (!npages) {
-		ret = -EINVAL;
-		goto err;
-	}
-
-	page_list = kmalloc_array(npages, sizeof(u64), GFP_KERNEL);
-	if (!page_list) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
-	for (i = 0; i < npages; i++)
-		page_list[i] = cpu_to_be64((u64)i << shift);
-
-	pr_debug("%s mask 0x%llx shift %d len %lld pbl_size %d\n",
-		 __func__, mask, shift, total_size, npages);
-
-	ret = iwch_alloc_pbl(mhp, npages);
-	if (ret) {
-		kfree(page_list);
-		goto err_pbl;
-	}
-
-	ret = iwch_write_pbl(mhp, page_list, npages, 0);
-	kfree(page_list);
-	if (ret)
-		goto err_pbl;
-
-	mhp->attr.pdid = php->pdid;
-	mhp->attr.zbva = 0;
-
-	mhp->attr.perms = iwch_ib_to_tpt_access(acc);
-	mhp->attr.va_fbo = 0;
-	mhp->attr.page_size = shift - 12;
-
-	mhp->attr.len = (u32) total_size;
-	mhp->attr.pbl_size = npages;
-	ret = iwch_register_mem(rhp, php, mhp, shift);
-	if (ret)
-		goto err_pbl;
-
-	return &mhp->ibmr;
-
-err_pbl:
-	iwch_free_pbl(mhp);
-
-err:
-	kfree(mhp);
-	return ERR_PTR(ret);
-}
-
-static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
-				      u64 virt, int acc, struct ib_udata *udata)
-{
-	__be64 *pages;
-	int shift, n, i;
-	int err = 0;
-	struct iwch_dev *rhp;
-	struct iwch_pd *php;
-	struct iwch_mr *mhp;
-	struct iwch_reg_user_mr_resp uresp;
-	struct sg_dma_page_iter sg_iter;
-	pr_debug("%s ib_pd %p\n", __func__, pd);
-
-	php = to_iwch_pd(pd);
-	rhp = php->rhp;
-	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
-	if (!mhp)
-		return ERR_PTR(-ENOMEM);
-
-	mhp->rhp = rhp;
-
-	mhp->umem = ib_umem_get(udata, start, length, acc, 0);
-	if (IS_ERR(mhp->umem)) {
-		err = PTR_ERR(mhp->umem);
-		kfree(mhp);
-		return ERR_PTR(err);
-	}
-
-	shift = PAGE_SHIFT;
-
-	n = ib_umem_num_pages(mhp->umem);
-
-	err = iwch_alloc_pbl(mhp, n);
-	if (err)
-		goto err;
-
-	pages = (__be64 *) __get_free_page(GFP_KERNEL);
-	if (!pages) {
-		err = -ENOMEM;
-		goto err_pbl;
-	}
-
-	i = n = 0;
-
-	for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) {
-		pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
-		if (i == PAGE_SIZE / sizeof(*pages)) {
-			err = iwch_write_pbl(mhp, pages, i, n);
-			if (err)
-				goto pbl_done;
-			n += i;
-			i = 0;
-		}
-	}
-
-	if (i)
-		err = iwch_write_pbl(mhp, pages, i, n);
-
-pbl_done:
-	free_page((unsigned long) pages);
-	if (err)
-		goto err_pbl;
-
-	mhp->attr.pdid = php->pdid;
-	mhp->attr.zbva = 0;
-	mhp->attr.perms = iwch_ib_to_tpt_access(acc);
-	mhp->attr.va_fbo = virt;
-	mhp->attr.page_size = shift - 12;
-	mhp->attr.len = (u32) length;
-
-	err = iwch_register_mem(rhp, php, mhp, shift);
-	if (err)
-		goto err_pbl;
-
-	if (udata && !t3a_device(rhp)) {
-		uresp.pbl_addr = (mhp->attr.pbl_addr -
-				 rhp->rdev.rnic_info.pbl_base) >> 3;
-		pr_debug("%s user resp pbl_addr 0x%x\n", __func__,
-			 uresp.pbl_addr);
-
-		if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
-			iwch_dereg_mr(&mhp->ibmr, udata);
-			err = -EFAULT;
-			goto err;
-		}
-	}
-
-	return &mhp->ibmr;
-
-err_pbl:
-	iwch_free_pbl(mhp);
-
-err:
-	ib_umem_release(mhp->umem);
-	kfree(mhp);
-	return ERR_PTR(err);
-}
-
-static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
-				   struct ib_udata *udata)
-{
-	struct iwch_dev *rhp;
-	struct iwch_pd *php;
-	struct iwch_mw *mhp;
-	u32 mmid;
-	u32 stag = 0;
-	int ret;
-
-	if (type != IB_MW_TYPE_1)
-		return ERR_PTR(-EINVAL);
-
-	php = to_iwch_pd(pd);
-	rhp = php->rhp;
-	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
-	if (!mhp)
-		return ERR_PTR(-ENOMEM);
-	ret = cxio_allocate_window(&rhp->rdev, &stag, php->pdid);
-	if (ret) {
-		kfree(mhp);
-		return ERR_PTR(ret);
-	}
-	mhp->rhp = rhp;
-	mhp->attr.pdid = php->pdid;
-	mhp->attr.type = TPT_MW;
-	mhp->attr.stag = stag;
-	mmid = (stag) >> 8;
-	mhp->ibmw.rkey = stag;
-	if (xa_insert_irq(&rhp->mrs, mmid, mhp, GFP_KERNEL)) {
-		cxio_deallocate_window(&rhp->rdev, mhp->attr.stag);
-		kfree(mhp);
-		return ERR_PTR(-ENOMEM);
-	}
-	pr_debug("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
-	return &(mhp->ibmw);
-}
-
-static int iwch_dealloc_mw(struct ib_mw *mw)
-{
-	struct iwch_dev *rhp;
-	struct iwch_mw *mhp;
-	u32 mmid;
-
-	mhp = to_iwch_mw(mw);
-	rhp = mhp->rhp;
-	mmid = (mw->rkey) >> 8;
-	cxio_deallocate_window(&rhp->rdev, mhp->attr.stag);
-	xa_erase_irq(&rhp->mrs, mmid);
-	pr_debug("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp);
-	kfree(mhp);
-	return 0;
-}
-
-static struct ib_mr *iwch_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-				   u32 max_num_sg, struct ib_udata *udata)
-{
-	struct iwch_dev *rhp;
-	struct iwch_pd *php;
-	struct iwch_mr *mhp;
-	u32 mmid;
-	u32 stag = 0;
-	int ret = -ENOMEM;
-
-	if (mr_type != IB_MR_TYPE_MEM_REG ||
-	    max_num_sg > T3_MAX_FASTREG_DEPTH)
-		return ERR_PTR(-EINVAL);
-
-	php = to_iwch_pd(pd);
-	rhp = php->rhp;
-	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
-	if (!mhp)
-		goto err;
-
-	mhp->pages = kcalloc(max_num_sg, sizeof(u64), GFP_KERNEL);
-	if (!mhp->pages)
-		goto pl_err;
-
-	mhp->rhp = rhp;
-	ret = iwch_alloc_pbl(mhp, max_num_sg);
-	if (ret)
-		goto err1;
-	mhp->attr.pbl_size = max_num_sg;
-	ret = cxio_allocate_stag(&rhp->rdev, &stag, php->pdid,
-				 mhp->attr.pbl_size, mhp->attr.pbl_addr);
-	if (ret)
-		goto err2;
-	mhp->attr.pdid = php->pdid;
-	mhp->attr.type = TPT_NON_SHARED_MR;
-	mhp->attr.stag = stag;
-	mhp->attr.state = 1;
-	mmid = (stag) >> 8;
-	mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
-	ret = xa_insert_irq(&rhp->mrs, mmid, mhp, GFP_KERNEL);
-	if (ret)
-		goto err3;
-
-	pr_debug("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
-	return &(mhp->ibmr);
-err3:
-	cxio_dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size,
-		       mhp->attr.pbl_addr);
-err2:
-	iwch_free_pbl(mhp);
-err1:
-	kfree(mhp->pages);
-pl_err:
-	kfree(mhp);
-err:
-	return ERR_PTR(ret);
-}
-
-static int iwch_set_page(struct ib_mr *ibmr, u64 addr)
-{
-	struct iwch_mr *mhp = to_iwch_mr(ibmr);
-
-	if (unlikely(mhp->npages == mhp->attr.pbl_size))
-		return -ENOMEM;
-
-	mhp->pages[mhp->npages++] = addr;
-
-	return 0;
-}
-
-static int iwch_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
-			  int sg_nents, unsigned int *sg_offset)
-{
-	struct iwch_mr *mhp = to_iwch_mr(ibmr);
-
-	mhp->npages = 0;
-
-	return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, iwch_set_page);
-}
-
-static int iwch_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
-{
-	struct iwch_dev *rhp;
-	struct iwch_qp *qhp;
-	struct iwch_qp_attributes attrs;
-	struct iwch_ucontext *ucontext;
-
-	qhp = to_iwch_qp(ib_qp);
-	rhp = qhp->rhp;
-
-	attrs.next_state = IWCH_QP_STATE_ERROR;
-	iwch_modify_qp(rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 0);
-	wait_event(qhp->wait, !qhp->ep);
-
-	xa_erase_irq(&rhp->qps, qhp->wq.qpid);
-
-	atomic_dec(&qhp->refcnt);
-	wait_event(qhp->wait, !atomic_read(&qhp->refcnt));
-
-	ucontext = rdma_udata_to_drv_context(udata, struct iwch_ucontext,
-					     ibucontext);
-	cxio_destroy_qp(&rhp->rdev, &qhp->wq,
-			ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
-
-	pr_debug("%s ib_qp %p qpid 0x%0x qhp %p\n", __func__,
-		 ib_qp, qhp->wq.qpid, qhp);
-	kfree(qhp);
-	return 0;
-}
-
-static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
-			     struct ib_qp_init_attr *attrs,
-			     struct ib_udata *udata)
-{
-	struct iwch_dev *rhp;
-	struct iwch_qp *qhp;
-	struct iwch_pd *php;
-	struct iwch_cq *schp;
-	struct iwch_cq *rchp;
-	struct iwch_create_qp_resp uresp;
-	int wqsize, sqsize, rqsize;
-	struct iwch_ucontext *ucontext;
-
-	pr_debug("%s ib_pd %p\n", __func__, pd);
-	if (attrs->qp_type != IB_QPT_RC)
-		return ERR_PTR(-EINVAL);
-	php = to_iwch_pd(pd);
-	rhp = php->rhp;
-	schp = get_chp(rhp, ((struct iwch_cq *) attrs->send_cq)->cq.cqid);
-	rchp = get_chp(rhp, ((struct iwch_cq *) attrs->recv_cq)->cq.cqid);
-	if (!schp || !rchp)
-		return ERR_PTR(-EINVAL);
-
-	/* The RQT size must be # of entries + 1 rounded up to a power of two */
-	rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr);
-	if (rqsize == attrs->cap.max_recv_wr)
-		rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr+1);
-
-	/* T3 doesn't support RQT depth < 16 */
-	if (rqsize < 16)
-		rqsize = 16;
-
-	if (rqsize > T3_MAX_RQ_SIZE)
-		return ERR_PTR(-EINVAL);
-
-	if (attrs->cap.max_inline_data > T3_MAX_INLINE)
-		return ERR_PTR(-EINVAL);
-
-	/*
-	 * NOTE: The SQ and total WQ sizes don't need to be
-	 * a power of two.  However, all the code assumes
-	 * they are. EG: Q_FREECNT() and friends.
-	 */
-	sqsize = roundup_pow_of_two(attrs->cap.max_send_wr);
-	wqsize = roundup_pow_of_two(rqsize + sqsize);
-
-	/*
-	 * Kernel users need more wq space for fastreg WRs which can take
-	 * 2 WR fragments.
-	 */
-	ucontext = rdma_udata_to_drv_context(udata, struct iwch_ucontext,
-					     ibucontext);
-	if (!ucontext && wqsize < (rqsize + (2 * sqsize)))
-		wqsize = roundup_pow_of_two(rqsize +
-				roundup_pow_of_two(attrs->cap.max_send_wr * 2));
-	pr_debug("%s wqsize %d sqsize %d rqsize %d\n", __func__,
-		 wqsize, sqsize, rqsize);
-	qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
-	if (!qhp)
-		return ERR_PTR(-ENOMEM);
-	qhp->wq.size_log2 = ilog2(wqsize);
-	qhp->wq.rq_size_log2 = ilog2(rqsize);
-	qhp->wq.sq_size_log2 = ilog2(sqsize);
-	if (cxio_create_qp(&rhp->rdev, !udata, &qhp->wq,
-			   ucontext ? &ucontext->uctx : &rhp->rdev.uctx)) {
-		kfree(qhp);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	attrs->cap.max_recv_wr = rqsize - 1;
-	attrs->cap.max_send_wr = sqsize;
-	attrs->cap.max_inline_data = T3_MAX_INLINE;
-
-	qhp->rhp = rhp;
-	qhp->attr.pd = php->pdid;
-	qhp->attr.scq = ((struct iwch_cq *) attrs->send_cq)->cq.cqid;
-	qhp->attr.rcq = ((struct iwch_cq *) attrs->recv_cq)->cq.cqid;
-	qhp->attr.sq_num_entries = attrs->cap.max_send_wr;
-	qhp->attr.rq_num_entries = attrs->cap.max_recv_wr;
-	qhp->attr.sq_max_sges = attrs->cap.max_send_sge;
-	qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge;
-	qhp->attr.rq_max_sges = attrs->cap.max_recv_sge;
-	qhp->attr.state = IWCH_QP_STATE_IDLE;
-	qhp->attr.next_state = IWCH_QP_STATE_IDLE;
-
-	/*
-	 * XXX - These don't get passed in from the openib user
-	 * at create time.  The CM sets them via a QP modify.
-	 * Need to fix...  I think the CM should
-	 */
-	qhp->attr.enable_rdma_read = 1;
-	qhp->attr.enable_rdma_write = 1;
-	qhp->attr.enable_bind = 1;
-	qhp->attr.max_ord = 1;
-	qhp->attr.max_ird = 1;
-
-	spin_lock_init(&qhp->lock);
-	init_waitqueue_head(&qhp->wait);
-	atomic_set(&qhp->refcnt, 1);
-
-	if (xa_store_irq(&rhp->qps, qhp->wq.qpid, qhp, GFP_KERNEL)) {
-		cxio_destroy_qp(&rhp->rdev, &qhp->wq,
-			ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
-		kfree(qhp);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	if (udata) {
-
-		struct iwch_mm_entry *mm1, *mm2;
-
-		mm1 = kmalloc(sizeof(*mm1), GFP_KERNEL);
-		if (!mm1) {
-			iwch_destroy_qp(&qhp->ibqp, udata);
-			return ERR_PTR(-ENOMEM);
-		}
-
-		mm2 = kmalloc(sizeof(*mm2), GFP_KERNEL);
-		if (!mm2) {
-			kfree(mm1);
-			iwch_destroy_qp(&qhp->ibqp, udata);
-			return ERR_PTR(-ENOMEM);
-		}
-
-		uresp.qpid = qhp->wq.qpid;
-		uresp.size_log2 = qhp->wq.size_log2;
-		uresp.sq_size_log2 = qhp->wq.sq_size_log2;
-		uresp.rq_size_log2 = qhp->wq.rq_size_log2;
-		spin_lock(&ucontext->mmap_lock);
-		uresp.key = ucontext->key;
-		ucontext->key += PAGE_SIZE;
-		uresp.db_key = ucontext->key;
-		ucontext->key += PAGE_SIZE;
-		spin_unlock(&ucontext->mmap_lock);
-		if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
-			kfree(mm1);
-			kfree(mm2);
-			iwch_destroy_qp(&qhp->ibqp, udata);
-			return ERR_PTR(-EFAULT);
-		}
-		mm1->key = uresp.key;
-		mm1->addr = virt_to_phys(qhp->wq.queue);
-		mm1->len = PAGE_ALIGN(wqsize * sizeof(union t3_wr));
-		insert_mmap(ucontext, mm1);
-		mm2->key = uresp.db_key;
-		mm2->addr = qhp->wq.udb & PAGE_MASK;
-		mm2->len = PAGE_SIZE;
-		insert_mmap(ucontext, mm2);
-	}
-	qhp->ibqp.qp_num = qhp->wq.qpid;
-	pr_debug(
-		"%s sq_num_entries %d, rq_num_entries %d qpid 0x%0x qhp %p dma_addr %pad size %d rq_addr 0x%x\n",
-		__func__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries,
-		qhp->wq.qpid, qhp, &qhp->wq.dma_addr, 1 << qhp->wq.size_log2,
-		qhp->wq.rq_addr);
-	return &qhp->ibqp;
-}
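
To make the queue sizing above concrete, here is one worked example for a kernel QP (so the extra fastreg headroom applies), following the same roundup_pow_of_two() rules as the code; the input numbers are arbitrary:

    max_recv_wr = 16, max_send_wr = 100
      rqsize = roundup_pow_of_two(16) = 16, which equals max_recv_wr,
               so rqsize = roundup_pow_of_two(17) = 32
      sqsize = roundup_pow_of_two(100) = 128
      wqsize = roundup_pow_of_two(32 + 128) = 256
      256 < 32 + 2 * 128, so for a kernel QP
      wqsize = roundup_pow_of_two(32 + roundup_pow_of_two(200)) = 512
      and the caller is handed back max_recv_wr = 31, max_send_wr = 128.
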
-
-static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-		      int attr_mask, struct ib_udata *udata)
-{
-	struct iwch_dev *rhp;
-	struct iwch_qp *qhp;
-	enum iwch_qp_attr_mask mask = 0;
-	struct iwch_qp_attributes attrs = {};
-
-	pr_debug("%s ib_qp %p\n", __func__, ibqp);
-
-	/* iwarp does not support the RTR state */
-	if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR))
-		attr_mask &= ~IB_QP_STATE;
-
-	/* Make sure we still have something left to do */
-	if (!attr_mask)
-		return 0;
-
-	qhp = to_iwch_qp(ibqp);
-	rhp = qhp->rhp;
-
-	attrs.next_state = iwch_convert_state(attr->qp_state);
-	attrs.enable_rdma_read = (attr->qp_access_flags &
-			       IB_ACCESS_REMOTE_READ) ?  1 : 0;
-	attrs.enable_rdma_write = (attr->qp_access_flags &
-				IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
-	attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0;
-
-
-	mask |= (attr_mask & IB_QP_STATE) ? IWCH_QP_ATTR_NEXT_STATE : 0;
-	mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ?
-			(IWCH_QP_ATTR_ENABLE_RDMA_READ |
-			 IWCH_QP_ATTR_ENABLE_RDMA_WRITE |
-			 IWCH_QP_ATTR_ENABLE_RDMA_BIND) : 0;
-
-	return iwch_modify_qp(rhp, qhp, mask, &attrs, 0);
-}
-
-void iwch_qp_add_ref(struct ib_qp *qp)
-{
-	pr_debug("%s ib_qp %p\n", __func__, qp);
-	atomic_inc(&(to_iwch_qp(qp)->refcnt));
-}
-
-void iwch_qp_rem_ref(struct ib_qp *qp)
-{
-	pr_debug("%s ib_qp %p\n", __func__, qp);
-	if (atomic_dec_and_test(&(to_iwch_qp(qp)->refcnt)))
-	        wake_up(&(to_iwch_qp(qp)->wait));
-}
-
-static struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn)
-{
-	pr_debug("%s ib_dev %p qpn 0x%x\n", __func__, dev, qpn);
-	return (struct ib_qp *)get_qhp(to_iwch_dev(dev), qpn);
-}
-
-
-static int iwch_query_pkey(struct ib_device *ibdev,
-			   u8 port, u16 index, u16 * pkey)
-{
-	pr_debug("%s ibdev %p\n", __func__, ibdev);
-	*pkey = 0;
-	return 0;
-}
-
-static int iwch_query_gid(struct ib_device *ibdev, u8 port,
-			  int index, union ib_gid *gid)
-{
-	struct iwch_dev *dev;
-
-	pr_debug("%s ibdev %p, port %d, index %d, gid %p\n",
-		 __func__, ibdev, port, index, gid);
-	dev = to_iwch_dev(ibdev);
-	BUG_ON(port == 0 || port > 2);
-	memset(&(gid->raw[0]), 0, sizeof(gid->raw));
-	memcpy(&(gid->raw[0]), dev->rdev.port_info.lldevs[port-1]->dev_addr, 6);
-	return 0;
-}
-
-static u64 fw_vers_string_to_u64(struct iwch_dev *iwch_dev)
-{
-	struct ethtool_drvinfo info;
-	struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
-	char *cp, *next;
-	unsigned fw_maj, fw_min, fw_mic;
-
-	lldev->ethtool_ops->get_drvinfo(lldev, &info);
-
-	next = info.fw_version + 1;
-	cp = strsep(&next, ".");
-	sscanf(cp, "%i", &fw_maj);
-	cp = strsep(&next, ".");
-	sscanf(cp, "%i", &fw_min);
-	cp = strsep(&next, ".");
-	sscanf(cp, "%i", &fw_mic);
-
-	return (((u64)fw_maj & 0xffff) << 32) | ((fw_min & 0xffff) << 16) |
-	       (fw_mic & 0xffff);
-}
-
-static int iwch_query_device(struct ib_device *ibdev, struct ib_device_attr *props,
-			     struct ib_udata *uhw)
-{
-
-	struct iwch_dev *dev;
-
-	pr_debug("%s ibdev %p\n", __func__, ibdev);
-
-	if (uhw->inlen || uhw->outlen)
-		return -EINVAL;
-
-	dev = to_iwch_dev(ibdev);
-	memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
-	props->hw_ver = dev->rdev.t3cdev_p->type;
-	props->fw_ver = fw_vers_string_to_u64(dev);
-	props->device_cap_flags = dev->device_cap_flags;
-	props->page_size_cap = dev->attr.mem_pgsizes_bitmask;
-	props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor;
-	props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device;
-	props->max_mr_size = dev->attr.max_mr_size;
-	props->max_qp = dev->attr.max_qps;
-	props->max_qp_wr = dev->attr.max_wrs;
-	props->max_send_sge = dev->attr.max_sge_per_wr;
-	props->max_recv_sge = dev->attr.max_sge_per_wr;
-	props->max_sge_rd = 1;
-	props->max_qp_rd_atom = dev->attr.max_rdma_reads_per_qp;
-	props->max_qp_init_rd_atom = dev->attr.max_rdma_reads_per_qp;
-	props->max_cq = dev->attr.max_cqs;
-	props->max_cqe = dev->attr.max_cqes_per_cq;
-	props->max_mr = dev->attr.max_mem_regs;
-	props->max_pd = dev->attr.max_pds;
-	props->local_ca_ack_delay = 0;
-	props->max_fast_reg_page_list_len = T3_MAX_FASTREG_DEPTH;
-
-	return 0;
-}
-
-static int iwch_query_port(struct ib_device *ibdev,
-			   u8 port, struct ib_port_attr *props)
-{
-	pr_debug("%s ibdev %p\n", __func__, ibdev);
-
-	props->port_cap_flags =
-	    IB_PORT_CM_SUP |
-	    IB_PORT_SNMP_TUNNEL_SUP |
-	    IB_PORT_REINIT_SUP |
-	    IB_PORT_DEVICE_MGMT_SUP |
-	    IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
-	props->gid_tbl_len = 1;
-	props->pkey_tbl_len = 1;
-	props->active_width = 2;
-	props->active_speed = IB_SPEED_DDR;
-	props->max_msg_sz = -1;
-
-	return 0;
-}
-
-static ssize_t hw_rev_show(struct device *dev,
-			   struct device_attribute *attr, char *buf)
-{
-	struct iwch_dev *iwch_dev =
-			rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
-
-	pr_debug("%s dev 0x%p\n", __func__, dev);
-	return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type);
-}
-static DEVICE_ATTR_RO(hw_rev);
-
-static ssize_t hca_type_show(struct device *dev,
-			     struct device_attribute *attr, char *buf)
-{
-	struct iwch_dev *iwch_dev =
-			rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
-	struct ethtool_drvinfo info;
-	struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
-
-	pr_debug("%s dev 0x%p\n", __func__, dev);
-	lldev->ethtool_ops->get_drvinfo(lldev, &info);
-	return sprintf(buf, "%s\n", info.driver);
-}
-static DEVICE_ATTR_RO(hca_type);
-
-static ssize_t board_id_show(struct device *dev,
-			     struct device_attribute *attr, char *buf)
-{
-	struct iwch_dev *iwch_dev =
-			rdma_device_to_drv_device(dev, struct iwch_dev, ibdev);
-
-	pr_debug("%s dev 0x%p\n", __func__, dev);
-	return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor,
-		       iwch_dev->rdev.rnic_info.pdev->device);
-}
-static DEVICE_ATTR_RO(board_id);
-
-enum counters {
-	IPINRECEIVES,
-	IPINHDRERRORS,
-	IPINADDRERRORS,
-	IPINUNKNOWNPROTOS,
-	IPINDISCARDS,
-	IPINDELIVERS,
-	IPOUTREQUESTS,
-	IPOUTDISCARDS,
-	IPOUTNOROUTES,
-	IPREASMTIMEOUT,
-	IPREASMREQDS,
-	IPREASMOKS,
-	IPREASMFAILS,
-	TCPACTIVEOPENS,
-	TCPPASSIVEOPENS,
-	TCPATTEMPTFAILS,
-	TCPESTABRESETS,
-	TCPCURRESTAB,
-	TCPINSEGS,
-	TCPOUTSEGS,
-	TCPRETRANSSEGS,
-	TCPINERRS,
-	TCPOUTRSTS,
-	TCPRTOMIN,
-	TCPRTOMAX,
-	NR_COUNTERS
-};
-
-static const char * const names[] = {
-	[IPINRECEIVES] = "ipInReceives",
-	[IPINHDRERRORS] = "ipInHdrErrors",
-	[IPINADDRERRORS] = "ipInAddrErrors",
-	[IPINUNKNOWNPROTOS] = "ipInUnknownProtos",
-	[IPINDISCARDS] = "ipInDiscards",
-	[IPINDELIVERS] = "ipInDelivers",
-	[IPOUTREQUESTS] = "ipOutRequests",
-	[IPOUTDISCARDS] = "ipOutDiscards",
-	[IPOUTNOROUTES] = "ipOutNoRoutes",
-	[IPREASMTIMEOUT] = "ipReasmTimeout",
-	[IPREASMREQDS] = "ipReasmReqds",
-	[IPREASMOKS] = "ipReasmOKs",
-	[IPREASMFAILS] = "ipReasmFails",
-	[TCPACTIVEOPENS] = "tcpActiveOpens",
-	[TCPPASSIVEOPENS] = "tcpPassiveOpens",
-	[TCPATTEMPTFAILS] = "tcpAttemptFails",
-	[TCPESTABRESETS] = "tcpEstabResets",
-	[TCPCURRESTAB] = "tcpCurrEstab",
-	[TCPINSEGS] = "tcpInSegs",
-	[TCPOUTSEGS] = "tcpOutSegs",
-	[TCPRETRANSSEGS] = "tcpRetransSegs",
-	[TCPINERRS] = "tcpInErrs",
-	[TCPOUTRSTS] = "tcpOutRsts",
-	[TCPRTOMIN] = "tcpRtoMin",
-	[TCPRTOMAX] = "tcpRtoMax",
-};
-
-static struct rdma_hw_stats *iwch_alloc_stats(struct ib_device *ibdev,
-					      u8 port_num)
-{
-	BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS);
-
-	/* Our driver only supports device level stats */
-	if (port_num != 0)
-		return NULL;
-
-	return rdma_alloc_hw_stats_struct(names, NR_COUNTERS,
-					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
-}
-
-static int iwch_get_mib(struct ib_device *ibdev, struct rdma_hw_stats *stats,
-			u8 port, int index)
-{
-	struct iwch_dev *dev;
-	struct tp_mib_stats m;
-	int ret;
-
-	if (port != 0 || !stats)
-		return -ENOSYS;
-
-	pr_debug("%s ibdev %p\n", __func__, ibdev);
-	dev = to_iwch_dev(ibdev);
-	ret = dev->rdev.t3cdev_p->ctl(dev->rdev.t3cdev_p, RDMA_GET_MIB, &m);
-	if (ret)
-		return -ENOSYS;
-
-	stats->value[IPINRECEIVES] = ((u64)m.ipInReceive_hi << 32) +	m.ipInReceive_lo;
-	stats->value[IPINHDRERRORS] = ((u64)m.ipInHdrErrors_hi << 32) + m.ipInHdrErrors_lo;
-	stats->value[IPINADDRERRORS] = ((u64)m.ipInAddrErrors_hi << 32) + m.ipInAddrErrors_lo;
-	stats->value[IPINUNKNOWNPROTOS] = ((u64)m.ipInUnknownProtos_hi << 32) + m.ipInUnknownProtos_lo;
-	stats->value[IPINDISCARDS] = ((u64)m.ipInDiscards_hi << 32) + m.ipInDiscards_lo;
-	stats->value[IPINDELIVERS] = ((u64)m.ipInDelivers_hi << 32) + m.ipInDelivers_lo;
-	stats->value[IPOUTREQUESTS] = ((u64)m.ipOutRequests_hi << 32) + m.ipOutRequests_lo;
-	stats->value[IPOUTDISCARDS] = ((u64)m.ipOutDiscards_hi << 32) + m.ipOutDiscards_lo;
-	stats->value[IPOUTNOROUTES] = ((u64)m.ipOutNoRoutes_hi << 32) + m.ipOutNoRoutes_lo;
-	stats->value[IPREASMTIMEOUT] = 	m.ipReasmTimeout;
-	stats->value[IPREASMREQDS] = m.ipReasmReqds;
-	stats->value[IPREASMOKS] = m.ipReasmOKs;
-	stats->value[IPREASMFAILS] = m.ipReasmFails;
-	stats->value[TCPACTIVEOPENS] =	m.tcpActiveOpens;
-	stats->value[TCPPASSIVEOPENS] =	m.tcpPassiveOpens;
-	stats->value[TCPATTEMPTFAILS] = m.tcpAttemptFails;
-	stats->value[TCPESTABRESETS] = m.tcpEstabResets;
-	stats->value[TCPCURRESTAB] = m.tcpCurrEstab;
-	stats->value[TCPINSEGS] = ((u64)m.tcpInSegs_hi << 32) + m.tcpInSegs_lo;
-	stats->value[TCPOUTSEGS] = ((u64)m.tcpOutSegs_hi << 32) + m.tcpOutSegs_lo;
-	stats->value[TCPRETRANSSEGS] = ((u64)m.tcpRetransSeg_hi << 32) + m.tcpRetransSeg_lo;
-	stats->value[TCPINERRS] = ((u64)m.tcpInErrs_hi << 32) + m.tcpInErrs_lo;
-	stats->value[TCPOUTRSTS] = m.tcpOutRsts;
-	stats->value[TCPRTOMIN] = m.tcpRtoMin;
-	stats->value[TCPRTOMAX] = m.tcpRtoMax;
-
-	return stats->num_counters;
-}
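
The wide MIB counters above are carried as 32-bit halves and reassembled with ((u64)hi << 32) + lo. A one-line helper expressing the same idiom, shown only for clarity (the driver open-codes it and the name mib64 is illustrative):

    static inline u64 mib64(u32 hi, u32 lo)
    {
            return ((u64)hi << 32) + lo;    /* widen hi before shifting */
    }
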
-
-static struct attribute *iwch_class_attributes[] = {
-	&dev_attr_hw_rev.attr,
-	&dev_attr_hca_type.attr,
-	&dev_attr_board_id.attr,
-	NULL
-};
-
-static const struct attribute_group iwch_attr_group = {
-	.attrs = iwch_class_attributes,
-};
-
-static int iwch_port_immutable(struct ib_device *ibdev, u8 port_num,
-			       struct ib_port_immutable *immutable)
-{
-	struct ib_port_attr attr;
-	int err;
-
-	immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
-
-	err = ib_query_port(ibdev, port_num, &attr);
-	if (err)
-		return err;
-
-	immutable->pkey_tbl_len = attr.pkey_tbl_len;
-	immutable->gid_tbl_len = attr.gid_tbl_len;
-
-	return 0;
-}
-
-static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str)
-{
-	struct iwch_dev *iwch_dev = to_iwch_dev(ibdev);
-	struct ethtool_drvinfo info;
-	struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
-
-	pr_debug("%s dev 0x%p\n", __func__, iwch_dev);
-	lldev->ethtool_ops->get_drvinfo(lldev, &info);
-	snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", info.fw_version);
-}
-
-static const struct ib_device_ops iwch_dev_ops = {
-	.owner = THIS_MODULE,
-	.driver_id = RDMA_DRIVER_CXGB3,
-	.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION,
-	.uverbs_no_driver_id_binding = 1,
-
-	.alloc_hw_stats	= iwch_alloc_stats,
-	.alloc_mr = iwch_alloc_mr,
-	.alloc_mw = iwch_alloc_mw,
-	.alloc_pd = iwch_allocate_pd,
-	.alloc_ucontext = iwch_alloc_ucontext,
-	.create_cq = iwch_create_cq,
-	.create_qp = iwch_create_qp,
-	.dealloc_mw = iwch_dealloc_mw,
-	.dealloc_pd = iwch_deallocate_pd,
-	.dealloc_ucontext = iwch_dealloc_ucontext,
-	.dereg_mr = iwch_dereg_mr,
-	.destroy_cq = iwch_destroy_cq,
-	.destroy_qp = iwch_destroy_qp,
-	.get_dev_fw_str = get_dev_fw_ver_str,
-	.get_dma_mr = iwch_get_dma_mr,
-	.get_hw_stats = iwch_get_mib,
-	.get_port_immutable = iwch_port_immutable,
-	.iw_accept = iwch_accept_cr,
-	.iw_add_ref = iwch_qp_add_ref,
-	.iw_connect = iwch_connect,
-	.iw_create_listen = iwch_create_listen,
-	.iw_destroy_listen = iwch_destroy_listen,
-	.iw_get_qp = iwch_get_qp,
-	.iw_reject = iwch_reject_cr,
-	.iw_rem_ref = iwch_qp_rem_ref,
-	.map_mr_sg = iwch_map_mr_sg,
-	.mmap = iwch_mmap,
-	.modify_qp = iwch_ib_modify_qp,
-	.poll_cq = iwch_poll_cq,
-	.post_recv = iwch_post_receive,
-	.post_send = iwch_post_send,
-	.query_device = iwch_query_device,
-	.query_gid = iwch_query_gid,
-	.query_pkey = iwch_query_pkey,
-	.query_port = iwch_query_port,
-	.reg_user_mr = iwch_reg_user_mr,
-	.req_notify_cq = iwch_arm_cq,
-	INIT_RDMA_OBJ_SIZE(ib_pd, iwch_pd, ibpd),
-	INIT_RDMA_OBJ_SIZE(ib_cq, iwch_cq, ibcq),
-	INIT_RDMA_OBJ_SIZE(ib_ucontext, iwch_ucontext, ibucontext),
-};
-
-static int set_netdevs(struct ib_device *ib_dev, struct cxio_rdev *rdev)
-{
-	int ret;
-	int i;
-
-	for (i = 0; i < rdev->port_info.nports; i++) {
-		ret = ib_device_set_netdev(ib_dev, rdev->port_info.lldevs[i],
-					   i + 1);
-		if (ret)
-			return ret;
-	}
-	return 0;
-}
-
-int iwch_register_device(struct iwch_dev *dev)
-{
-	int err;
-
-	pr_debug("%s iwch_dev %p\n", __func__, dev);
-	memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
-	memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
-	dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
-				IB_DEVICE_MEM_WINDOW |
-				IB_DEVICE_MEM_MGT_EXTENSIONS;
-
-	/* cxgb3 supports STag 0. */
-	dev->ibdev.local_dma_lkey = 0;
-
-	dev->ibdev.uverbs_cmd_mask =
-	    (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
-	    (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
-	    (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
-	    (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
-	    (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
-	    (1ull << IB_USER_VERBS_CMD_REG_MR) |
-	    (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
-	    (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
-	    (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
-	    (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
-	    (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
-	    (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
-	    (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
-	    (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
-	    (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
-	    (1ull << IB_USER_VERBS_CMD_POST_SEND) |
-	    (1ull << IB_USER_VERBS_CMD_POST_RECV);
-	dev->ibdev.node_type = RDMA_NODE_RNIC;
-	BUILD_BUG_ON(sizeof(IWCH_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX);
-	memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC));
-	dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports;
-	dev->ibdev.num_comp_vectors = 1;
-	dev->ibdev.dev.parent = &dev->rdev.rnic_info.pdev->dev;
-
-	memcpy(dev->ibdev.iw_ifname, dev->rdev.t3cdev_p->lldev->name,
-	       sizeof(dev->ibdev.iw_ifname));
-
-	rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group);
-	ib_set_device_ops(&dev->ibdev, &iwch_dev_ops);
-	err = set_netdevs(&dev->ibdev, &dev->rdev);
-	if (err)
-		return err;
-
-	return ib_register_device(&dev->ibdev, "cxgb3_%d");
-}
-
-void iwch_unregister_device(struct iwch_dev *dev)
-{
-	pr_debug("%s iwch_dev %p\n", __func__, dev);
-	ib_unregister_device(&dev->ibdev);
-	return;
-}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h
deleted file mode 100644
index 8adbe96..0000000
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.h
+++ /dev/null
@@ -1,347 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef __IWCH_PROVIDER_H__
-#define __IWCH_PROVIDER_H__
-
-#include <linux/list.h>
-#include <linux/spinlock.h>
-#include <rdma/ib_verbs.h>
-#include <asm/types.h>
-#include "t3cdev.h"
-#include "iwch.h"
-#include "cxio_wr.h"
-#include "cxio_hal.h"
-
-struct iwch_pd {
-	struct ib_pd ibpd;
-	u32 pdid;
-	struct iwch_dev *rhp;
-};
-
-static inline struct iwch_pd *to_iwch_pd(struct ib_pd *ibpd)
-{
-	return container_of(ibpd, struct iwch_pd, ibpd);
-}
-
-struct tpt_attributes {
-	u32 stag;
-	u32 state:1;
-	u32 type:2;
-	u32 rsvd:1;
-	enum tpt_mem_perm perms;
-	u32 remote_invaliate_disable:1;
-	u32 zbva:1;
-	u32 mw_bind_enable:1;
-	u32 page_size:5;
-
-	u32 pdid;
-	u32 qpid;
-	u32 pbl_addr;
-	u32 len;
-	u64 va_fbo;
-	u32 pbl_size;
-};
-
-struct iwch_mr {
-	struct ib_mr ibmr;
-	struct ib_umem *umem;
-	struct iwch_dev *rhp;
-	u64 kva;
-	struct tpt_attributes attr;
-	u64 *pages;
-	u32 npages;
-};
-
-typedef struct iwch_mw iwch_mw_handle;
-
-static inline struct iwch_mr *to_iwch_mr(struct ib_mr *ibmr)
-{
-	return container_of(ibmr, struct iwch_mr, ibmr);
-}
-
-struct iwch_mw {
-	struct ib_mw ibmw;
-	struct iwch_dev *rhp;
-	u64 kva;
-	struct tpt_attributes attr;
-};
-
-static inline struct iwch_mw *to_iwch_mw(struct ib_mw *ibmw)
-{
-	return container_of(ibmw, struct iwch_mw, ibmw);
-}
-
-struct iwch_cq {
-	struct ib_cq ibcq;
-	struct iwch_dev *rhp;
-	struct t3_cq cq;
-	spinlock_t lock;
-	spinlock_t comp_handler_lock;
-	atomic_t refcnt;
-	wait_queue_head_t wait;
-	u32 __user *user_rptr_addr;
-};
-
-static inline struct iwch_cq *to_iwch_cq(struct ib_cq *ibcq)
-{
-	return container_of(ibcq, struct iwch_cq, ibcq);
-}
-
-enum IWCH_QP_FLAGS {
-	QP_QUIESCED = 0x01
-};
-
-struct iwch_mpa_attributes {
-	u8 initiator;
-	u8 recv_marker_enabled;
-	u8 xmit_marker_enabled;	/* iWARP: enable inbound Read Resp. */
-	u8 crc_enabled;
-	u8 version;	/* 0 or 1 */
-};
-
-struct iwch_qp_attributes {
-	u32 scq;
-	u32 rcq;
-	u32 sq_num_entries;
-	u32 rq_num_entries;
-	u32 sq_max_sges;
-	u32 sq_max_sges_rdma_write;
-	u32 rq_max_sges;
-	u32 state;
-	u8 enable_rdma_read;
-	u8 enable_rdma_write;	/* enable inbound Read Resp. */
-	u8 enable_bind;
-	u8 enable_mmid0_fastreg;	/* Enable STAG0 + Fast-register */
-	/*
-	 * Next QP state. If specify the current state, only the
-	 * QP attributes will be modified.
-	 */
-	u32 max_ord;
-	u32 max_ird;
-	u32 pd;	/* IN */
-	u32 next_state;
-	char terminate_buffer[52];
-	u32 terminate_msg_len;
-	u8 is_terminate_local;
-	struct iwch_mpa_attributes mpa_attr;	/* IN-OUT */
-	struct iwch_ep *llp_stream_handle;
-	char *stream_msg_buf;	/* Last stream msg. before Idle -> RTS */
-	u32 stream_msg_buf_len;	/* Only on Idle -> RTS */
-};
-
-struct iwch_qp {
-	struct ib_qp ibqp;
-	struct iwch_dev *rhp;
-	struct iwch_ep *ep;
-	struct iwch_qp_attributes attr;
-	struct t3_wq wq;
-	spinlock_t lock;
-	atomic_t refcnt;
-	wait_queue_head_t wait;
-	enum IWCH_QP_FLAGS flags;
-};
-
-static inline int qp_quiesced(struct iwch_qp *qhp)
-{
-	return qhp->flags & QP_QUIESCED;
-}
-
-static inline struct iwch_qp *to_iwch_qp(struct ib_qp *ibqp)
-{
-	return container_of(ibqp, struct iwch_qp, ibqp);
-}
-
-void iwch_qp_add_ref(struct ib_qp *qp);
-void iwch_qp_rem_ref(struct ib_qp *qp);
-
-struct iwch_ucontext {
-	struct ib_ucontext ibucontext;
-	struct cxio_ucontext uctx;
-	u32 key;
-	spinlock_t mmap_lock;
-	struct list_head mmaps;
-};
-
-static inline struct iwch_ucontext *to_iwch_ucontext(struct ib_ucontext *c)
-{
-	return container_of(c, struct iwch_ucontext, ibucontext);
-}
-
-struct iwch_mm_entry {
-	struct list_head entry;
-	u64 addr;
-	u32 key;
-	unsigned len;
-};
-
-static inline struct iwch_mm_entry *remove_mmap(struct iwch_ucontext *ucontext,
-						u32 key, unsigned len)
-{
-	struct list_head *pos, *nxt;
-	struct iwch_mm_entry *mm;
-
-	spin_lock(&ucontext->mmap_lock);
-	list_for_each_safe(pos, nxt, &ucontext->mmaps) {
-
-		mm = list_entry(pos, struct iwch_mm_entry, entry);
-		if (mm->key == key && mm->len == len) {
-			list_del_init(&mm->entry);
-			spin_unlock(&ucontext->mmap_lock);
-			pr_debug("%s key 0x%x addr 0x%llx len %d\n",
-				 __func__, key,
-				 (unsigned long long)mm->addr, mm->len);
-			return mm;
-		}
-	}
-	spin_unlock(&ucontext->mmap_lock);
-	return NULL;
-}
-
-static inline void insert_mmap(struct iwch_ucontext *ucontext,
-			       struct iwch_mm_entry *mm)
-{
-	spin_lock(&ucontext->mmap_lock);
-	pr_debug("%s key 0x%x addr 0x%llx len %d\n",
-		 __func__, mm->key, (unsigned long long)mm->addr, mm->len);
-	list_add_tail(&mm->entry, &ucontext->mmaps);
-	spin_unlock(&ucontext->mmap_lock);
-}
-
-enum iwch_qp_attr_mask {
-	IWCH_QP_ATTR_NEXT_STATE = 1 << 0,
-	IWCH_QP_ATTR_ENABLE_RDMA_READ = 1 << 7,
-	IWCH_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8,
-	IWCH_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9,
-	IWCH_QP_ATTR_MAX_ORD = 1 << 11,
-	IWCH_QP_ATTR_MAX_IRD = 1 << 12,
-	IWCH_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22,
-	IWCH_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23,
-	IWCH_QP_ATTR_MPA_ATTR = 1 << 24,
-	IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25,
-	IWCH_QP_ATTR_VALID_MODIFY = (IWCH_QP_ATTR_ENABLE_RDMA_READ |
-				     IWCH_QP_ATTR_ENABLE_RDMA_WRITE |
-				     IWCH_QP_ATTR_MAX_ORD |
-				     IWCH_QP_ATTR_MAX_IRD |
-				     IWCH_QP_ATTR_LLP_STREAM_HANDLE |
-				     IWCH_QP_ATTR_STREAM_MSG_BUFFER |
-				     IWCH_QP_ATTR_MPA_ATTR |
-				     IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE)
-};
-
-int iwch_modify_qp(struct iwch_dev *rhp,
-				struct iwch_qp *qhp,
-				enum iwch_qp_attr_mask mask,
-				struct iwch_qp_attributes *attrs,
-				int internal);
-
-enum iwch_qp_state {
-	IWCH_QP_STATE_IDLE,
-	IWCH_QP_STATE_RTS,
-	IWCH_QP_STATE_ERROR,
-	IWCH_QP_STATE_TERMINATE,
-	IWCH_QP_STATE_CLOSING,
-	IWCH_QP_STATE_TOT
-};
-
-static inline int iwch_convert_state(enum ib_qp_state ib_state)
-{
-	switch (ib_state) {
-	case IB_QPS_RESET:
-	case IB_QPS_INIT:
-		return IWCH_QP_STATE_IDLE;
-	case IB_QPS_RTS:
-		return IWCH_QP_STATE_RTS;
-	case IB_QPS_SQD:
-		return IWCH_QP_STATE_CLOSING;
-	case IB_QPS_SQE:
-		return IWCH_QP_STATE_TERMINATE;
-	case IB_QPS_ERR:
-		return IWCH_QP_STATE_ERROR;
-	default:
-		return -1;
-	}
-}
-
-static inline u32 iwch_ib_to_tpt_access(int acc)
-{
-	return (acc & IB_ACCESS_REMOTE_WRITE ? TPT_REMOTE_WRITE : 0) |
-	       (acc & IB_ACCESS_REMOTE_READ ? TPT_REMOTE_READ : 0) |
-	       (acc & IB_ACCESS_LOCAL_WRITE ? TPT_LOCAL_WRITE : 0) |
-	       (acc & IB_ACCESS_MW_BIND ? TPT_MW_BIND : 0) |
-	       TPT_LOCAL_READ;
-}
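
As a quick reading of the mapping above, derived directly from the function body: iwch_ib_to_tpt_access(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE) evaluates to TPT_LOCAL_WRITE | TPT_REMOTE_WRITE | TPT_LOCAL_READ, since TPT_LOCAL_READ is always granted regardless of the IB access flags.
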
-
-static inline u32 iwch_ib_to_tpt_bind_access(int acc)
-{
-	return (acc & IB_ACCESS_REMOTE_WRITE ? TPT_REMOTE_WRITE : 0) |
-	       (acc & IB_ACCESS_REMOTE_READ ? TPT_REMOTE_READ : 0);
-}
-
-enum iwch_mmid_state {
-	IWCH_STAG_STATE_VALID,
-	IWCH_STAG_STATE_INVALID
-};
-
-enum iwch_qp_query_flags {
-	IWCH_QP_QUERY_CONTEXT_NONE = 0x0,	/* No ctx; Only attrs */
-	IWCH_QP_QUERY_CONTEXT_GET = 0x1,	/* Get ctx + attrs */
-	IWCH_QP_QUERY_CONTEXT_SUSPEND = 0x2,	/* Not Supported */
-
-	/*
-	 * Quiesce QP context; Consumer
-	 * will NOT replay outstanding WR
-	 */
-	IWCH_QP_QUERY_CONTEXT_QUIESCE = 0x4,
-	IWCH_QP_QUERY_CONTEXT_REMOVE = 0x8,
-	IWCH_QP_QUERY_TEST_USERWRITE = 0x32	/* Test special */
-};
-
-u16 iwch_rqes_posted(struct iwch_qp *qhp);
-int iwch_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
-		   const struct ib_send_wr **bad_wr);
-int iwch_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
-		      const struct ib_recv_wr **bad_wr);
-int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
-int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg);
-int iwch_post_zb_read(struct iwch_ep *ep);
-int iwch_register_device(struct iwch_dev *dev);
-void iwch_unregister_device(struct iwch_dev *dev);
-void stop_read_rep_timer(struct iwch_qp *qhp);
-int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
-		      struct iwch_mr *mhp, int shift);
-int iwch_alloc_pbl(struct iwch_mr *mhp, int npages);
-void iwch_free_pbl(struct iwch_mr *mhp);
-int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset);
-
-#define IWCH_NODE_DESC "cxgb3 Chelsio Communications"
-
-#endif
diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
deleted file mode 100644
index c649faa..0000000
--- a/drivers/infiniband/hw/cxgb3/iwch_qp.c
+++ /dev/null
@@ -1,1082 +0,0 @@
-/*
- * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <linux/sched.h>
-#include <linux/gfp.h>
-#include "iwch_provider.h"
-#include "iwch.h"
-#include "iwch_cm.h"
-#include "cxio_hal.h"
-#include "cxio_resource.h"
-
-#define NO_SUPPORT -1
-
-static int build_rdma_send(union t3_wr *wqe, const struct ib_send_wr *wr,
-			   u8 *flit_cnt)
-{
-	int i;
-	u32 plen;
-
-	switch (wr->opcode) {
-	case IB_WR_SEND:
-		if (wr->send_flags & IB_SEND_SOLICITED)
-			wqe->send.rdmaop = T3_SEND_WITH_SE;
-		else
-			wqe->send.rdmaop = T3_SEND;
-		wqe->send.rem_stag = 0;
-		break;
-	case IB_WR_SEND_WITH_INV:
-		if (wr->send_flags & IB_SEND_SOLICITED)
-			wqe->send.rdmaop = T3_SEND_WITH_SE_INV;
-		else
-			wqe->send.rdmaop = T3_SEND_WITH_INV;
-		wqe->send.rem_stag = cpu_to_be32(wr->ex.invalidate_rkey);
-		break;
-	default:
-		return -EINVAL;
-	}
-	if (wr->num_sge > T3_MAX_SGE)
-		return -EINVAL;
-	wqe->send.reserved[0] = 0;
-	wqe->send.reserved[1] = 0;
-	wqe->send.reserved[2] = 0;
-	plen = 0;
-	for (i = 0; i < wr->num_sge; i++) {
-		if ((plen + wr->sg_list[i].length) < plen)
-			return -EMSGSIZE;
-
-		plen += wr->sg_list[i].length;
-		wqe->send.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey);
-		wqe->send.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
-		wqe->send.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr);
-	}
-	wqe->send.num_sgle = cpu_to_be32(wr->num_sge);
-	*flit_cnt = 4 + ((wr->num_sge) << 1);
-	wqe->send.plen = cpu_to_be32(plen);
-	return 0;
-}
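
Flit counts in these builders appear to be in 8-byte units, as the sizeof(...) >> 3 conversions elsewhere in this file suggest. For the send case above, each SGE contributes two flits (one for the stag/len pair, one for the 64-bit address), so for example:

    wr->num_sge = 2:
      flit_cnt = 4 + (2 << 1) = 8 flits, i.e. 64 bytes of WQE payload
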
-
-static int build_rdma_write(union t3_wr *wqe, const struct ib_send_wr *wr,
-			    u8 *flit_cnt)
-{
-	int i;
-	u32 plen;
-	if (wr->num_sge > T3_MAX_SGE)
-		return -EINVAL;
-	wqe->write.rdmaop = T3_RDMA_WRITE;
-	wqe->write.reserved[0] = 0;
-	wqe->write.reserved[1] = 0;
-	wqe->write.reserved[2] = 0;
-	wqe->write.stag_sink = cpu_to_be32(rdma_wr(wr)->rkey);
-	wqe->write.to_sink = cpu_to_be64(rdma_wr(wr)->remote_addr);
-
-	if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
-		plen = 4;
-		wqe->write.sgl[0].stag = wr->ex.imm_data;
-		wqe->write.sgl[0].len = cpu_to_be32(0);
-		wqe->write.num_sgle = cpu_to_be32(0);
-		*flit_cnt = 6;
-	} else {
-		plen = 0;
-		for (i = 0; i < wr->num_sge; i++) {
-			if ((plen + wr->sg_list[i].length) < plen) {
-				return -EMSGSIZE;
-			}
-			plen += wr->sg_list[i].length;
-			wqe->write.sgl[i].stag =
-			    cpu_to_be32(wr->sg_list[i].lkey);
-			wqe->write.sgl[i].len =
-			    cpu_to_be32(wr->sg_list[i].length);
-			wqe->write.sgl[i].to =
-			    cpu_to_be64(wr->sg_list[i].addr);
-		}
-		wqe->write.num_sgle = cpu_to_be32(wr->num_sge);
-		*flit_cnt = 5 + ((wr->num_sge) << 1);
-	}
-	wqe->write.plen = cpu_to_be32(plen);
-	return 0;
-}
-
-static int build_rdma_read(union t3_wr *wqe, const struct ib_send_wr *wr,
-			   u8 *flit_cnt)
-{
-	if (wr->num_sge > 1)
-		return -EINVAL;
-	wqe->read.rdmaop = T3_READ_REQ;
-	if (wr->opcode == IB_WR_RDMA_READ_WITH_INV)
-		wqe->read.local_inv = 1;
-	else
-		wqe->read.local_inv = 0;
-	wqe->read.reserved[0] = 0;
-	wqe->read.reserved[1] = 0;
-	wqe->read.rem_stag = cpu_to_be32(rdma_wr(wr)->rkey);
-	wqe->read.rem_to = cpu_to_be64(rdma_wr(wr)->remote_addr);
-	wqe->read.local_stag = cpu_to_be32(wr->sg_list[0].lkey);
-	wqe->read.local_len = cpu_to_be32(wr->sg_list[0].length);
-	wqe->read.local_to = cpu_to_be64(wr->sg_list[0].addr);
-	*flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3;
-	return 0;
-}
-
-static int build_memreg(union t3_wr *wqe, const struct ib_reg_wr *wr,
-			u8 *flit_cnt, int *wr_cnt, struct t3_wq *wq)
-{
-	struct iwch_mr *mhp = to_iwch_mr(wr->mr);
-	int i;
-	__be64 *p;
-
-	if (mhp->npages > T3_MAX_FASTREG_DEPTH)
-		return -EINVAL;
-	*wr_cnt = 1;
-	wqe->fastreg.stag = cpu_to_be32(wr->key);
-	wqe->fastreg.len = cpu_to_be32(mhp->ibmr.length);
-	wqe->fastreg.va_base_hi = cpu_to_be32(mhp->ibmr.iova >> 32);
-	wqe->fastreg.va_base_lo_fbo =
-				cpu_to_be32(mhp->ibmr.iova & 0xffffffff);
-	wqe->fastreg.page_type_perms = cpu_to_be32(
-		V_FR_PAGE_COUNT(mhp->npages) |
-		V_FR_PAGE_SIZE(ilog2(wr->mr->page_size) - 12) |
-		V_FR_TYPE(TPT_VATO) |
-		V_FR_PERMS(iwch_ib_to_tpt_access(wr->access)));
-	p = &wqe->fastreg.pbl_addrs[0];
-	for (i = 0; i < mhp->npages; i++, p++) {
-
-		/* If we need a 2nd WR, then set it up */
-		if (i == T3_MAX_FASTREG_FRAG) {
-			*wr_cnt = 2;
-			wqe = (union t3_wr *)(wq->queue +
-				Q_PTR2IDX((wq->wptr+1), wq->size_log2));
-			build_fw_riwrh((void *)wqe, T3_WR_FASTREG, 0,
-			       Q_GENBIT(wq->wptr + 1, wq->size_log2),
-			       0, 1 + mhp->npages - T3_MAX_FASTREG_FRAG,
-			       T3_EOP);
-
-			p = &wqe->pbl_frag.pbl_addrs[0];
-		}
-		*p = cpu_to_be64((u64)mhp->pages[i]);
-	}
-	*flit_cnt = 5 + mhp->npages;
-	if (*flit_cnt > 15)
-		*flit_cnt = 15;
-	return 0;
-}
-
-static int build_inv_stag(union t3_wr *wqe, const struct ib_send_wr *wr,
-			  u8 *flit_cnt)
-{
-	wqe->local_inv.stag = cpu_to_be32(wr->ex.invalidate_rkey);
-	wqe->local_inv.reserved = 0;
-	*flit_cnt = sizeof(struct t3_local_inv_wr) >> 3;
-	return 0;
-}
-
-static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list,
-			    u32 num_sgle, u32 * pbl_addr, u8 * page_size)
-{
-	int i;
-	struct iwch_mr *mhp;
-	u64 offset;
-	for (i = 0; i < num_sgle; i++) {
-
-		mhp = get_mhp(rhp, (sg_list[i].lkey) >> 8);
-		if (!mhp) {
-			pr_debug("%s %d\n", __func__, __LINE__);
-			return -EIO;
-		}
-		if (!mhp->attr.state) {
-			pr_debug("%s %d\n", __func__, __LINE__);
-			return -EIO;
-		}
-		if (mhp->attr.zbva) {
-			pr_debug("%s %d\n", __func__, __LINE__);
-			return -EIO;
-		}
-
-		if (sg_list[i].addr < mhp->attr.va_fbo) {
-			pr_debug("%s %d\n", __func__, __LINE__);
-			return -EINVAL;
-		}
-		if (sg_list[i].addr + ((u64) sg_list[i].length) <
-		    sg_list[i].addr) {
-			pr_debug("%s %d\n", __func__, __LINE__);
-			return -EINVAL;
-		}
-		if (sg_list[i].addr + ((u64) sg_list[i].length) >
-		    mhp->attr.va_fbo + ((u64) mhp->attr.len)) {
-			pr_debug("%s %d\n", __func__, __LINE__);
-			return -EINVAL;
-		}
-		offset = sg_list[i].addr - mhp->attr.va_fbo;
-		offset += mhp->attr.va_fbo &
-			  ((1UL << (12 + mhp->attr.page_size)) - 1);
-		pbl_addr[i] = ((mhp->attr.pbl_addr -
-			        rhp->rdev.rnic_info.pbl_base) >> 3) +
-			      (offset >> (12 + mhp->attr.page_size));
-		page_size[i] = mhp->attr.page_size;
-	}
-	return 0;
-}
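
The PBL index computed above can be followed with a small worked example. attr.page_size stores the page shift minus 12 (see the MR registration paths earlier in this patch) and PBL entries are 8 bytes, hence the >> 3. Assuming a 4 KB-page MR (attr.page_size = 0) whose PBL starts 0x200 bytes past pbl_base, with va_fbo = 0x10800 and an SGE address of 0x12345 that passes the bounds checks:

    offset      = 0x12345 - 0x10800             = 0x1b45
    offset     += 0x10800 & 0xfff               -> 0x2345
    pbl_addr[i] = (0x200 >> 3) + (0x2345 >> 12) = 0x40 + 2 = 0x42
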
-
-static int build_rdma_recv(struct iwch_qp *qhp, union t3_wr *wqe,
-			   const struct ib_recv_wr *wr)
-{
-	int i, err = 0;
-	u32 pbl_addr[T3_MAX_SGE];
-	u8 page_size[T3_MAX_SGE];
-
-	err = iwch_sgl2pbl_map(qhp->rhp, wr->sg_list, wr->num_sge, pbl_addr,
-			       page_size);
-	if (err)
-		return err;
-	wqe->recv.pagesz[0] = page_size[0];
-	wqe->recv.pagesz[1] = page_size[1];
-	wqe->recv.pagesz[2] = page_size[2];
-	wqe->recv.pagesz[3] = page_size[3];
-	wqe->recv.num_sgle = cpu_to_be32(wr->num_sge);
-	for (i = 0; i < wr->num_sge; i++) {
-		wqe->recv.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey);
-		wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
-
-		/* to in the WQE == the offset into the page */
-		wqe->recv.sgl[i].to = cpu_to_be64(((u32)wr->sg_list[i].addr) &
-				((1UL << (12 + page_size[i])) - 1));
-
-		/* pbl_addr is the adapter's address in the PBL */
-		wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]);
-	}
-	for (; i < T3_MAX_SGE; i++) {
-		wqe->recv.sgl[i].stag = 0;
-		wqe->recv.sgl[i].len = 0;
-		wqe->recv.sgl[i].to = 0;
-		wqe->recv.pbl_addr[i] = 0;
-	}
-	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
-			     qhp->wq.rq_size_log2)].wr_id = wr->wr_id;
-	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
-			     qhp->wq.rq_size_log2)].pbl_addr = 0;
-	return 0;
-}
-
-static int build_zero_stag_recv(struct iwch_qp *qhp, union t3_wr *wqe,
-				const struct ib_recv_wr *wr)
-{
-	int i;
-	u32 pbl_addr;
-	u32 pbl_offset;
-
-
-	/*
-	 * The T3 HW requires the PBL in the HW recv descriptor to reference
-	 * a PBL entry.  So we allocate the max needed PBL memory here and pass
-	 * it to the uP in the recv WR.  The uP will build the PBL and setup
-	 * the HW recv descriptor.
-	 */
-	pbl_addr = cxio_hal_pblpool_alloc(&qhp->rhp->rdev, T3_STAG0_PBL_SIZE);
-	if (!pbl_addr)
-		return -ENOMEM;
-
-	/*
-	 * Compute the 8B aligned offset.
-	 */
-	pbl_offset = (pbl_addr - qhp->rhp->rdev.rnic_info.pbl_base) >> 3;
-
-	wqe->recv.num_sgle = cpu_to_be32(wr->num_sge);
-
-	for (i = 0; i < wr->num_sge; i++) {
-
-		/*
-		 * Use a 128MB page size. This and an imposed 128MB
-		 * sge length limit allows us to require only a 2-entry HW
-		 * PBL for each SGE.  This restriction is acceptable since
-		 * it is not possible to allocate 128MB of contiguous
-		 * DMA coherent memory!
-		 */
-		if (wr->sg_list[i].length > T3_STAG0_MAX_PBE_LEN)
-			return -EINVAL;
-		wqe->recv.pagesz[i] = T3_STAG0_PAGE_SHIFT;
-
-		/*
-		 * T3 restricts a recv to all zero-stag or all non-zero-stag.
-		 */
-		if (wr->sg_list[i].lkey != 0)
-			return -EINVAL;
-		wqe->recv.sgl[i].stag = 0;
-		wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
-		wqe->recv.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr);
-		wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_offset);
-		pbl_offset += 2;
-	}
-	for (; i < T3_MAX_SGE; i++) {
-		wqe->recv.pagesz[i] = 0;
-		wqe->recv.sgl[i].stag = 0;
-		wqe->recv.sgl[i].len = 0;
-		wqe->recv.sgl[i].to = 0;
-		wqe->recv.pbl_addr[i] = 0;
-	}
-	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
-			     qhp->wq.rq_size_log2)].wr_id = wr->wr_id;
-	qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
-			     qhp->wq.rq_size_log2)].pbl_addr = pbl_addr;
-	return 0;
-}
-
-int iwch_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
-		   const struct ib_send_wr **bad_wr)
-{
-	int err = 0;
-	u8 uninitialized_var(t3_wr_flit_cnt);
-	enum t3_wr_opcode t3_wr_opcode = 0;
-	enum t3_wr_flags t3_wr_flags;
-	struct iwch_qp *qhp;
-	u32 idx;
-	union t3_wr *wqe;
-	u32 num_wrs;
-	unsigned long flag;
-	struct t3_swsq *sqp;
-	int wr_cnt = 1;
-
-	qhp = to_iwch_qp(ibqp);
-	spin_lock_irqsave(&qhp->lock, flag);
-	if (qhp->attr.state > IWCH_QP_STATE_RTS) {
-		spin_unlock_irqrestore(&qhp->lock, flag);
-		err = -EINVAL;
-		goto out;
-	}
-	num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr,
-		  qhp->wq.sq_size_log2);
-	if (num_wrs == 0) {
-		spin_unlock_irqrestore(&qhp->lock, flag);
-		err = -ENOMEM;
-		goto out;
-	}
-	while (wr) {
-		if (num_wrs == 0) {
-			err = -ENOMEM;
-			break;
-		}
-		idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
-		wqe = (union t3_wr *) (qhp->wq.queue + idx);
-		t3_wr_flags = 0;
-		if (wr->send_flags & IB_SEND_SOLICITED)
-			t3_wr_flags |= T3_SOLICITED_EVENT_FLAG;
-		if (wr->send_flags & IB_SEND_SIGNALED)
-			t3_wr_flags |= T3_COMPLETION_FLAG;
-		sqp = qhp->wq.sq +
-		      Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2);
-		switch (wr->opcode) {
-		case IB_WR_SEND:
-		case IB_WR_SEND_WITH_INV:
-			if (wr->send_flags & IB_SEND_FENCE)
-				t3_wr_flags |= T3_READ_FENCE_FLAG;
-			t3_wr_opcode = T3_WR_SEND;
-			err = build_rdma_send(wqe, wr, &t3_wr_flit_cnt);
-			break;
-		case IB_WR_RDMA_WRITE:
-		case IB_WR_RDMA_WRITE_WITH_IMM:
-			t3_wr_opcode = T3_WR_WRITE;
-			err = build_rdma_write(wqe, wr, &t3_wr_flit_cnt);
-			break;
-		case IB_WR_RDMA_READ:
-		case IB_WR_RDMA_READ_WITH_INV:
-			t3_wr_opcode = T3_WR_READ;
-			t3_wr_flags = 0; /* T3 reads are always signaled */
-			err = build_rdma_read(wqe, wr, &t3_wr_flit_cnt);
-			if (err)
-				break;
-			sqp->read_len = wqe->read.local_len;
-			if (!qhp->wq.oldest_read)
-				qhp->wq.oldest_read = sqp;
-			break;
-		case IB_WR_REG_MR:
-			t3_wr_opcode = T3_WR_FASTREG;
-			err = build_memreg(wqe, reg_wr(wr), &t3_wr_flit_cnt,
-					   &wr_cnt, &qhp->wq);
-			break;
-		case IB_WR_LOCAL_INV:
-			if (wr->send_flags & IB_SEND_FENCE)
-				t3_wr_flags |= T3_LOCAL_FENCE_FLAG;
-			t3_wr_opcode = T3_WR_INV_STAG;
-			err = build_inv_stag(wqe, wr, &t3_wr_flit_cnt);
-			break;
-		default:
-			pr_debug("%s post of type=%d TBD!\n", __func__,
-				 wr->opcode);
-			err = -EINVAL;
-		}
-		if (err)
-			break;
-		wqe->send.wrid.id0.hi = qhp->wq.sq_wptr;
-		sqp->wr_id = wr->wr_id;
-		sqp->opcode = wr2opcode(t3_wr_opcode);
-		sqp->sq_wptr = qhp->wq.sq_wptr;
-		sqp->complete = 0;
-		sqp->signaled = (wr->send_flags & IB_SEND_SIGNALED);
-
-		build_fw_riwrh((void *) wqe, t3_wr_opcode, t3_wr_flags,
-			       Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2),
-			       0, t3_wr_flit_cnt,
-			       (wr_cnt == 1) ? T3_SOPEOP : T3_SOP);
-		pr_debug("%s cookie 0x%llx wq idx 0x%x swsq idx %ld opcode %d\n",
-			 __func__, (unsigned long long)wr->wr_id, idx,
-			 Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2),
-			 sqp->opcode);
-		wr = wr->next;
-		num_wrs--;
-		qhp->wq.wptr += wr_cnt;
-		++(qhp->wq.sq_wptr);
-	}
-	spin_unlock_irqrestore(&qhp->lock, flag);
-	if (cxio_wq_db_enabled(&qhp->wq))
-		ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
-
-out:
-	if (err)
-		*bad_wr = wr;
-	return err;
-}
-
-int iwch_post_receive(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
-		      const struct ib_recv_wr **bad_wr)
-{
-	int err = 0;
-	struct iwch_qp *qhp;
-	u32 idx;
-	union t3_wr *wqe;
-	u32 num_wrs;
-	unsigned long flag;
-
-	qhp = to_iwch_qp(ibqp);
-	spin_lock_irqsave(&qhp->lock, flag);
-	if (qhp->attr.state > IWCH_QP_STATE_RTS) {
-		spin_unlock_irqrestore(&qhp->lock, flag);
-		err = -EINVAL;
-		goto out;
-	}
-	num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr,
-			    qhp->wq.rq_size_log2) - 1;
-	if (!wr) {
-		spin_unlock_irqrestore(&qhp->lock, flag);
-		err = -ENOMEM;
-		goto out;
-	}
-	while (wr) {
-		if (wr->num_sge > T3_MAX_SGE) {
-			err = -EINVAL;
-			break;
-		}
-		idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
-		wqe = (union t3_wr *) (qhp->wq.queue + idx);
-		if (num_wrs)
-			if (wr->sg_list[0].lkey)
-				err = build_rdma_recv(qhp, wqe, wr);
-			else
-				err = build_zero_stag_recv(qhp, wqe, wr);
-		else
-			err = -ENOMEM;
-
-		if (err)
-			break;
-
-		build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG,
-			       Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2),
-			       0, sizeof(struct t3_receive_wr) >> 3, T3_SOPEOP);
-		pr_debug("%s cookie 0x%llx idx 0x%x rq_wptr 0x%x rw_rptr 0x%x wqe %p\n",
-			 __func__, (unsigned long long)wr->wr_id,
-			 idx, qhp->wq.rq_wptr, qhp->wq.rq_rptr, wqe);
-		++(qhp->wq.rq_wptr);
-		++(qhp->wq.wptr);
-		wr = wr->next;
-		num_wrs--;
-	}
-	spin_unlock_irqrestore(&qhp->lock, flag);
-	if (cxio_wq_db_enabled(&qhp->wq))
-		ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
-
-out:
-	if (err)
-		*bad_wr = wr;
-	return err;
-}
-
-static inline void build_term_codes(struct respQ_msg_t *rsp_msg,
-				    u8 *layer_type, u8 *ecode)
-{
-	int status = TPT_ERR_INTERNAL_ERR;
-	int tagged = 0;
-	int opcode = -1;
-	int rqtype = 0;
-	int send_inv = 0;
-
-	if (rsp_msg) {
-		status = CQE_STATUS(rsp_msg->cqe);
-		opcode = CQE_OPCODE(rsp_msg->cqe);
-		rqtype = RQ_TYPE(rsp_msg->cqe);
-		send_inv = (opcode == T3_SEND_WITH_INV) ||
-		           (opcode == T3_SEND_WITH_SE_INV);
-		tagged = (opcode == T3_RDMA_WRITE) ||
-			 (rqtype && (opcode == T3_READ_RESP));
-	}
-
-	switch (status) {
-	case TPT_ERR_STAG:
-		if (send_inv) {
-			*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
-			*ecode = RDMAP_CANT_INV_STAG;
-		} else {
-			*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
-			*ecode = RDMAP_INV_STAG;
-		}
-		break;
-	case TPT_ERR_PDID:
-		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
-		if ((opcode == T3_SEND_WITH_INV) ||
-		    (opcode == T3_SEND_WITH_SE_INV))
-			*ecode = RDMAP_CANT_INV_STAG;
-		else
-			*ecode = RDMAP_STAG_NOT_ASSOC;
-		break;
-	case TPT_ERR_QPID:
-		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
-		*ecode = RDMAP_STAG_NOT_ASSOC;
-		break;
-	case TPT_ERR_ACCESS:
-		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
-		*ecode = RDMAP_ACC_VIOL;
-		break;
-	case TPT_ERR_WRAP:
-		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
-		*ecode = RDMAP_TO_WRAP;
-		break;
-	case TPT_ERR_BOUND:
-		if (tagged) {
-			*layer_type = LAYER_DDP|DDP_TAGGED_ERR;
-			*ecode = DDPT_BASE_BOUNDS;
-		} else {
-			*layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
-			*ecode = RDMAP_BASE_BOUNDS;
-		}
-		break;
-	case TPT_ERR_INVALIDATE_SHARED_MR:
-	case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND:
-		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
-		*ecode = RDMAP_CANT_INV_STAG;
-		break;
-	case TPT_ERR_ECC:
-	case TPT_ERR_ECC_PSTAG:
-	case TPT_ERR_INTERNAL_ERR:
-		*layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA;
-		*ecode = 0;
-		break;
-	case TPT_ERR_OUT_OF_RQE:
-		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
-		*ecode = DDPU_INV_MSN_NOBUF;
-		break;
-	case TPT_ERR_PBL_ADDR_BOUND:
-		*layer_type = LAYER_DDP|DDP_TAGGED_ERR;
-		*ecode = DDPT_BASE_BOUNDS;
-		break;
-	case TPT_ERR_CRC:
-		*layer_type = LAYER_MPA|DDP_LLP;
-		*ecode = MPA_CRC_ERR;
-		break;
-	case TPT_ERR_MARKER:
-		*layer_type = LAYER_MPA|DDP_LLP;
-		*ecode = MPA_MARKER_ERR;
-		break;
-	case TPT_ERR_PDU_LEN_ERR:
-		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
-		*ecode = DDPU_MSG_TOOBIG;
-		break;
-	case TPT_ERR_DDP_VERSION:
-		if (tagged) {
-			*layer_type = LAYER_DDP|DDP_TAGGED_ERR;
-			*ecode = DDPT_INV_VERS;
-		} else {
-			*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
-			*ecode = DDPU_INV_VERS;
-		}
-		break;
-	case TPT_ERR_RDMA_VERSION:
-		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
-		*ecode = RDMAP_INV_VERS;
-		break;
-	case TPT_ERR_OPCODE:
-		*layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
-		*ecode = RDMAP_INV_OPCODE;
-		break;
-	case TPT_ERR_DDP_QUEUE_NUM:
-		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
-		*ecode = DDPU_INV_QN;
-		break;
-	case TPT_ERR_MSN:
-	case TPT_ERR_MSN_GAP:
-	case TPT_ERR_MSN_RANGE:
-	case TPT_ERR_IRD_OVERFLOW:
-		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
-		*ecode = DDPU_INV_MSN_RANGE;
-		break;
-	case TPT_ERR_TBIT:
-		*layer_type = LAYER_DDP|DDP_LOCAL_CATA;
-		*ecode = 0;
-		break;
-	case TPT_ERR_MO:
-		*layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
-		*ecode = DDPU_INV_MO;
-		break;
-	default:
-		*layer_type = LAYER_RDMAP|DDP_LOCAL_CATA;
-		*ecode = 0;
-		break;
-	}
-}
-
-int iwch_post_zb_read(struct iwch_ep *ep)
-{
-	union t3_wr *wqe;
-	struct sk_buff *skb;
-	u8 flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3;
-
-	pr_debug("%s enter\n", __func__);
-	skb = alloc_skb(40, GFP_KERNEL);
-	if (!skb) {
-		pr_err("%s cannot send zb_read!!\n", __func__);
-		return -ENOMEM;
-	}
-	wqe = skb_put_zero(skb, sizeof(struct t3_rdma_read_wr));
-	wqe->read.rdmaop = T3_READ_REQ;
-	wqe->read.reserved[0] = 0;
-	wqe->read.reserved[1] = 0;
-	wqe->read.rem_stag = cpu_to_be32(1);
-	wqe->read.rem_to = cpu_to_be64(1);
-	wqe->read.local_stag = cpu_to_be32(1);
-	wqe->read.local_len = cpu_to_be32(0);
-	wqe->read.local_to = cpu_to_be64(1);
-	wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_READ));
-	wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(ep->hwtid)|
-						V_FW_RIWR_LEN(flit_cnt));
-	skb->priority = CPL_PRIORITY_DATA;
-	return iwch_cxgb3_ofld_send(ep->com.qp->rhp->rdev.t3cdev_p, skb);
-}
-
-/*
- * This posts a TERMINATE with layer=RDMA, type=catastrophic.
- */
-int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg)
-{
-	union t3_wr *wqe;
-	struct terminate_message *term;
-	struct sk_buff *skb;
-
-	pr_debug("%s %d\n", __func__, __LINE__);
-	skb = alloc_skb(40, GFP_ATOMIC);
-	if (!skb) {
-		pr_err("%s cannot send TERMINATE!\n", __func__);
-		return -ENOMEM;
-	}
-	wqe = skb_put_zero(skb, 40);
-	wqe->send.rdmaop = T3_TERMINATE;
-
-	/* immediate data length */
-	wqe->send.plen = htonl(4);
-
-	/* immediate data starts here. */
-	term = (struct terminate_message *)wqe->send.sgl;
-	build_term_codes(rsp_msg, &term->layer_etype, &term->ecode);
-	wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_SEND) |
-			 V_FW_RIWR_FLAGS(T3_COMPLETION_FLAG | T3_NOTIFY_FLAG));
-	wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(qhp->ep->hwtid));
-	skb->priority = CPL_PRIORITY_DATA;
-	return iwch_cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb);
-}
-
-/*
- * Assumes qhp lock is held.
- */
-static void __flush_qp(struct iwch_qp *qhp, struct iwch_cq *rchp,
-				struct iwch_cq *schp)
-	__releases(&qhp->lock)
-	__acquires(&qhp->lock)
-{
-	int count;
-	int flushed;
-
-	lockdep_assert_held(&qhp->lock);
-
-	pr_debug("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp);
-	/* take a ref on the qhp since we must release the lock */
-	atomic_inc(&qhp->refcnt);
-	spin_unlock(&qhp->lock);
-
-	/* locking hierarchy: cq lock first, then qp lock. */
-	spin_lock(&rchp->lock);
-	spin_lock(&qhp->lock);
-	cxio_flush_hw_cq(&rchp->cq);
-	cxio_count_rcqes(&rchp->cq, &qhp->wq, &count);
-	flushed = cxio_flush_rq(&qhp->wq, &rchp->cq, count);
-	spin_unlock(&qhp->lock);
-	spin_unlock(&rchp->lock);
-	if (flushed) {
-		spin_lock(&rchp->comp_handler_lock);
-		(*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
-		spin_unlock(&rchp->comp_handler_lock);
-	}
-
-	/* locking hierarchy: cq lock first, then qp lock. */
-	spin_lock(&schp->lock);
-	spin_lock(&qhp->lock);
-	cxio_flush_hw_cq(&schp->cq);
-	cxio_count_scqes(&schp->cq, &qhp->wq, &count);
-	flushed = cxio_flush_sq(&qhp->wq, &schp->cq, count);
-	spin_unlock(&qhp->lock);
-	spin_unlock(&schp->lock);
-	if (flushed) {
-		spin_lock(&schp->comp_handler_lock);
-		(*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context);
-		spin_unlock(&schp->comp_handler_lock);
-	}
-
-	/* deref */
-	if (atomic_dec_and_test(&qhp->refcnt))
-	        wake_up(&qhp->wait);
-
-	spin_lock(&qhp->lock);
-}
-
-static void flush_qp(struct iwch_qp *qhp)
-{
-	struct iwch_cq *rchp, *schp;
-
-	rchp = get_chp(qhp->rhp, qhp->attr.rcq);
-	schp = get_chp(qhp->rhp, qhp->attr.scq);
-
-	if (qhp->ibqp.uobject) {
-		cxio_set_wq_in_error(&qhp->wq);
-		cxio_set_cq_in_error(&rchp->cq);
-		spin_lock(&rchp->comp_handler_lock);
-		(*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
-		spin_unlock(&rchp->comp_handler_lock);
-		if (schp != rchp) {
-			cxio_set_cq_in_error(&schp->cq);
-			spin_lock(&schp->comp_handler_lock);
-			(*schp->ibcq.comp_handler)(&schp->ibcq,
-						   schp->ibcq.cq_context);
-			spin_unlock(&schp->comp_handler_lock);
-		}
-		return;
-	}
-	__flush_qp(qhp, rchp, schp);
-}
-
-
-/*
- * Return count of RECV WRs posted
- */
-u16 iwch_rqes_posted(struct iwch_qp *qhp)
-{
-	union t3_wr *wqe = qhp->wq.queue;
-	u16 count = 0;
-
-	while (count < USHRT_MAX && fw_riwrh_opcode((struct fw_riwrh *)wqe) == T3_WR_RCV) {
-		count++;
-		wqe++;
-	}
-	pr_debug("%s qhp %p count %u\n", __func__, qhp, count);
-	return count;
-}
-
-static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp,
-				enum iwch_qp_attr_mask mask,
-				struct iwch_qp_attributes *attrs)
-{
-	struct t3_rdma_init_attr init_attr;
-	int ret;
-
-	init_attr.tid = qhp->ep->hwtid;
-	init_attr.qpid = qhp->wq.qpid;
-	init_attr.pdid = qhp->attr.pd;
-	init_attr.scqid = qhp->attr.scq;
-	init_attr.rcqid = qhp->attr.rcq;
-	init_attr.rq_addr = qhp->wq.rq_addr;
-	init_attr.rq_size = 1 << qhp->wq.rq_size_log2;
-	init_attr.mpaattrs = uP_RI_MPA_IETF_ENABLE |
-		qhp->attr.mpa_attr.recv_marker_enabled |
-		(qhp->attr.mpa_attr.xmit_marker_enabled << 1) |
-		(qhp->attr.mpa_attr.crc_enabled << 2);
-
-	init_attr.qpcaps = uP_RI_QP_RDMA_READ_ENABLE |
-			   uP_RI_QP_RDMA_WRITE_ENABLE |
-			   uP_RI_QP_BIND_ENABLE;
-	if (!qhp->ibqp.uobject)
-		init_attr.qpcaps |= uP_RI_QP_STAG0_ENABLE |
-				    uP_RI_QP_FAST_REGISTER_ENABLE;
-
-	init_attr.tcp_emss = qhp->ep->emss;
-	init_attr.ord = qhp->attr.max_ord;
-	init_attr.ird = qhp->attr.max_ird;
-	init_attr.qp_dma_addr = qhp->wq.dma_addr;
-	init_attr.qp_dma_size = (1UL << qhp->wq.size_log2);
-	init_attr.rqe_count = iwch_rqes_posted(qhp);
-	init_attr.flags = qhp->attr.mpa_attr.initiator ? MPA_INITIATOR : 0;
-	init_attr.chan = qhp->ep->l2t->smt_idx;
-	if (peer2peer) {
-		init_attr.rtr_type = RTR_READ;
-		if (init_attr.ord == 0 && qhp->attr.mpa_attr.initiator)
-			init_attr.ord = 1;
-		if (init_attr.ird == 0 && !qhp->attr.mpa_attr.initiator)
-			init_attr.ird = 1;
-	} else
-		init_attr.rtr_type = 0;
-	init_attr.irs = qhp->ep->rcv_seq;
-	pr_debug("%s init_attr.rq_addr 0x%x init_attr.rq_size = %d flags 0x%x qpcaps 0x%x\n",
-		 __func__,
-		 init_attr.rq_addr, init_attr.rq_size,
-		 init_attr.flags, init_attr.qpcaps);
-	ret = cxio_rdma_init(&rhp->rdev, &init_attr);
-	pr_debug("%s ret %d\n", __func__, ret);
-	return ret;
-}
-
-int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp,
-				enum iwch_qp_attr_mask mask,
-				struct iwch_qp_attributes *attrs,
-				int internal)
-{
-	int ret = 0;
-	struct iwch_qp_attributes newattr = qhp->attr;
-	unsigned long flag;
-	int disconnect = 0;
-	int terminate = 0;
-	int abort = 0;
-	int free = 0;
-	struct iwch_ep *ep = NULL;
-
-	pr_debug("%s qhp %p qpid 0x%x ep %p state %d -> %d\n", __func__,
-		 qhp, qhp->wq.qpid, qhp->ep, qhp->attr.state,
-		 (mask & IWCH_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1);
-
-	spin_lock_irqsave(&qhp->lock, flag);
-
-	/* Process attr changes if in IDLE */
-	if (mask & IWCH_QP_ATTR_VALID_MODIFY) {
-		if (qhp->attr.state != IWCH_QP_STATE_IDLE) {
-			ret = -EIO;
-			goto out;
-		}
-		if (mask & IWCH_QP_ATTR_ENABLE_RDMA_READ)
-			newattr.enable_rdma_read = attrs->enable_rdma_read;
-		if (mask & IWCH_QP_ATTR_ENABLE_RDMA_WRITE)
-			newattr.enable_rdma_write = attrs->enable_rdma_write;
-		if (mask & IWCH_QP_ATTR_ENABLE_RDMA_BIND)
-			newattr.enable_bind = attrs->enable_bind;
-		if (mask & IWCH_QP_ATTR_MAX_ORD) {
-			if (attrs->max_ord >
-			    rhp->attr.max_rdma_read_qp_depth) {
-				ret = -EINVAL;
-				goto out;
-			}
-			newattr.max_ord = attrs->max_ord;
-		}
-		if (mask & IWCH_QP_ATTR_MAX_IRD) {
-			if (attrs->max_ird >
-			    rhp->attr.max_rdma_reads_per_qp) {
-				ret = -EINVAL;
-				goto out;
-			}
-			newattr.max_ird = attrs->max_ird;
-		}
-		qhp->attr = newattr;
-	}
-
-	if (!(mask & IWCH_QP_ATTR_NEXT_STATE))
-		goto out;
-	if (qhp->attr.state == attrs->next_state)
-		goto out;
-
-	switch (qhp->attr.state) {
-	case IWCH_QP_STATE_IDLE:
-		switch (attrs->next_state) {
-		case IWCH_QP_STATE_RTS:
-			if (!(mask & IWCH_QP_ATTR_LLP_STREAM_HANDLE)) {
-				ret = -EINVAL;
-				goto out;
-			}
-			if (!(mask & IWCH_QP_ATTR_MPA_ATTR)) {
-				ret = -EINVAL;
-				goto out;
-			}
-			qhp->attr.mpa_attr = attrs->mpa_attr;
-			qhp->attr.llp_stream_handle = attrs->llp_stream_handle;
-			qhp->ep = qhp->attr.llp_stream_handle;
-			qhp->attr.state = IWCH_QP_STATE_RTS;
-
-			/*
-			 * Ref the endpoint here and deref when we
-			 * disassociate the endpoint from the QP.  This
-			 * happens in CLOSING->IDLE transition or *->ERROR
-			 * transition.
-			 */
-			get_ep(&qhp->ep->com);
-			spin_unlock_irqrestore(&qhp->lock, flag);
-			ret = rdma_init(rhp, qhp, mask, attrs);
-			spin_lock_irqsave(&qhp->lock, flag);
-			if (ret)
-				goto err;
-			break;
-		case IWCH_QP_STATE_ERROR:
-			qhp->attr.state = IWCH_QP_STATE_ERROR;
-			flush_qp(qhp);
-			break;
-		default:
-			ret = -EINVAL;
-			goto out;
-		}
-		break;
-	case IWCH_QP_STATE_RTS:
-		switch (attrs->next_state) {
-		case IWCH_QP_STATE_CLOSING:
-			BUG_ON(kref_read(&qhp->ep->com.kref) < 2);
-			qhp->attr.state = IWCH_QP_STATE_CLOSING;
-			if (!internal) {
-				abort=0;
-				disconnect = 1;
-				ep = qhp->ep;
-				get_ep(&ep->com);
-			}
-			break;
-		case IWCH_QP_STATE_TERMINATE:
-			qhp->attr.state = IWCH_QP_STATE_TERMINATE;
-			if (qhp->ibqp.uobject)
-				cxio_set_wq_in_error(&qhp->wq);
-			if (!internal)
-				terminate = 1;
-			break;
-		case IWCH_QP_STATE_ERROR:
-			qhp->attr.state = IWCH_QP_STATE_ERROR;
-			if (!internal) {
-				abort=1;
-				disconnect = 1;
-				ep = qhp->ep;
-				get_ep(&ep->com);
-			}
-			goto err;
-			break;
-		default:
-			ret = -EINVAL;
-			goto out;
-		}
-		break;
-	case IWCH_QP_STATE_CLOSING:
-		if (!internal) {
-			ret = -EINVAL;
-			goto out;
-		}
-		switch (attrs->next_state) {
-			case IWCH_QP_STATE_IDLE:
-				flush_qp(qhp);
-				qhp->attr.state = IWCH_QP_STATE_IDLE;
-				qhp->attr.llp_stream_handle = NULL;
-				put_ep(&qhp->ep->com);
-				qhp->ep = NULL;
-				wake_up(&qhp->wait);
-				break;
-			case IWCH_QP_STATE_ERROR:
-				goto err;
-			default:
-				ret = -EINVAL;
-				goto err;
-		}
-		break;
-	case IWCH_QP_STATE_ERROR:
-		if (attrs->next_state != IWCH_QP_STATE_IDLE) {
-			ret = -EINVAL;
-			goto out;
-		}
-
-		if (!Q_EMPTY(qhp->wq.sq_rptr, qhp->wq.sq_wptr) ||
-		    !Q_EMPTY(qhp->wq.rq_rptr, qhp->wq.rq_wptr)) {
-			ret = -EINVAL;
-			goto out;
-		}
-		qhp->attr.state = IWCH_QP_STATE_IDLE;
-		break;
-	case IWCH_QP_STATE_TERMINATE:
-		if (!internal) {
-			ret = -EINVAL;
-			goto out;
-		}
-		goto err;
-		break;
-	default:
-		pr_err("%s in a bad state %d\n", __func__, qhp->attr.state);
-		ret = -EINVAL;
-		goto err;
-		break;
-	}
-	goto out;
-err:
-	pr_debug("%s disassociating ep %p qpid 0x%x\n", __func__, qhp->ep,
-		 qhp->wq.qpid);
-
-	/* disassociate the LLP connection */
-	qhp->attr.llp_stream_handle = NULL;
-	ep = qhp->ep;
-	qhp->ep = NULL;
-	qhp->attr.state = IWCH_QP_STATE_ERROR;
-	free=1;
-	wake_up(&qhp->wait);
-	BUG_ON(!ep);
-	flush_qp(qhp);
-out:
-	spin_unlock_irqrestore(&qhp->lock, flag);
-
-	if (terminate)
-		iwch_post_terminate(qhp, NULL);
-
-	/*
-	 * If disconnect is 1, then we need to initiate a disconnect
-	 * on the EP.  This can be a normal close (RTS->CLOSING) or
-	 * an abnormal close (RTS/CLOSING->ERROR).
-	 */
-	if (disconnect) {
-		iwch_ep_disconnect(ep, abort, GFP_KERNEL);
-		put_ep(&ep->com);
-	}
-
-	/*
-	 * If free is 1, then we've disassociated the EP from the QP
-	 * and we need to dereference the EP.
-	 */
-	if (free)
-		put_ep(&ep->com);
-
-	pr_debug("%s exit state %d\n", __func__, qhp->attr.state);
-	return ret;
-}
diff --git a/drivers/infiniband/hw/cxgb3/tcb.h b/drivers/infiniband/hw/cxgb3/tcb.h
deleted file mode 100644
index c702dc1..0000000
--- a/drivers/infiniband/hw/cxgb3/tcb.h
+++ /dev/null
@@ -1,632 +0,0 @@
-/*
- * Copyright (c) 2007 Chelsio, Inc. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef _TCB_DEFS_H
-#define _TCB_DEFS_H
-
-#define W_TCB_T_STATE    0
-#define S_TCB_T_STATE    0
-#define M_TCB_T_STATE    0xfULL
-#define V_TCB_T_STATE(x) ((x) << S_TCB_T_STATE)
-
-#define W_TCB_TIMER    0
-#define S_TCB_TIMER    4
-#define M_TCB_TIMER    0x1ULL
-#define V_TCB_TIMER(x) ((x) << S_TCB_TIMER)
-
-#define W_TCB_DACK_TIMER    0
-#define S_TCB_DACK_TIMER    5
-#define M_TCB_DACK_TIMER    0x1ULL
-#define V_TCB_DACK_TIMER(x) ((x) << S_TCB_DACK_TIMER)
-
-#define W_TCB_DEL_FLAG    0
-#define S_TCB_DEL_FLAG    6
-#define M_TCB_DEL_FLAG    0x1ULL
-#define V_TCB_DEL_FLAG(x) ((x) << S_TCB_DEL_FLAG)
-
-#define W_TCB_L2T_IX    0
-#define S_TCB_L2T_IX    7
-#define M_TCB_L2T_IX    0x7ffULL
-#define V_TCB_L2T_IX(x) ((x) << S_TCB_L2T_IX)
-
-#define W_TCB_SMAC_SEL    0
-#define S_TCB_SMAC_SEL    18
-#define M_TCB_SMAC_SEL    0x3ULL
-#define V_TCB_SMAC_SEL(x) ((x) << S_TCB_SMAC_SEL)
-
-#define W_TCB_TOS    0
-#define S_TCB_TOS    20
-#define M_TCB_TOS    0x3fULL
-#define V_TCB_TOS(x) ((x) << S_TCB_TOS)
-
-#define W_TCB_MAX_RT    0
-#define S_TCB_MAX_RT    26
-#define M_TCB_MAX_RT    0xfULL
-#define V_TCB_MAX_RT(x) ((x) << S_TCB_MAX_RT)
-
-#define W_TCB_T_RXTSHIFT    0
-#define S_TCB_T_RXTSHIFT    30
-#define M_TCB_T_RXTSHIFT    0xfULL
-#define V_TCB_T_RXTSHIFT(x) ((x) << S_TCB_T_RXTSHIFT)
-
-#define W_TCB_T_DUPACKS    1
-#define S_TCB_T_DUPACKS    2
-#define M_TCB_T_DUPACKS    0xfULL
-#define V_TCB_T_DUPACKS(x) ((x) << S_TCB_T_DUPACKS)
-
-#define W_TCB_T_MAXSEG    1
-#define S_TCB_T_MAXSEG    6
-#define M_TCB_T_MAXSEG    0xfULL
-#define V_TCB_T_MAXSEG(x) ((x) << S_TCB_T_MAXSEG)
-
-#define W_TCB_T_FLAGS1    1
-#define S_TCB_T_FLAGS1    10
-#define M_TCB_T_FLAGS1    0xffffffffULL
-#define V_TCB_T_FLAGS1(x) ((x) << S_TCB_T_FLAGS1)
-
-#define W_TCB_T_MIGRATION    1
-#define S_TCB_T_MIGRATION    20
-#define M_TCB_T_MIGRATION    0x1ULL
-#define V_TCB_T_MIGRATION(x) ((x) << S_TCB_T_MIGRATION)
-
-#define W_TCB_T_FLAGS2    2
-#define S_TCB_T_FLAGS2    10
-#define M_TCB_T_FLAGS2    0x7fULL
-#define V_TCB_T_FLAGS2(x) ((x) << S_TCB_T_FLAGS2)
-
-#define W_TCB_SND_SCALE    2
-#define S_TCB_SND_SCALE    17
-#define M_TCB_SND_SCALE    0xfULL
-#define V_TCB_SND_SCALE(x) ((x) << S_TCB_SND_SCALE)
-
-#define W_TCB_RCV_SCALE    2
-#define S_TCB_RCV_SCALE    21
-#define M_TCB_RCV_SCALE    0xfULL
-#define V_TCB_RCV_SCALE(x) ((x) << S_TCB_RCV_SCALE)
-
-#define W_TCB_SND_UNA_RAW    2
-#define S_TCB_SND_UNA_RAW    25
-#define M_TCB_SND_UNA_RAW    0x7ffffffULL
-#define V_TCB_SND_UNA_RAW(x) ((x) << S_TCB_SND_UNA_RAW)
-
-#define W_TCB_SND_NXT_RAW    3
-#define S_TCB_SND_NXT_RAW    20
-#define M_TCB_SND_NXT_RAW    0x7ffffffULL
-#define V_TCB_SND_NXT_RAW(x) ((x) << S_TCB_SND_NXT_RAW)
-
-#define W_TCB_RCV_NXT    4
-#define S_TCB_RCV_NXT    15
-#define M_TCB_RCV_NXT    0xffffffffULL
-#define V_TCB_RCV_NXT(x) ((x) << S_TCB_RCV_NXT)
-
-#define W_TCB_RCV_ADV    5
-#define S_TCB_RCV_ADV    15
-#define M_TCB_RCV_ADV    0xffffULL
-#define V_TCB_RCV_ADV(x) ((x) << S_TCB_RCV_ADV)
-
-#define W_TCB_SND_MAX_RAW    5
-#define S_TCB_SND_MAX_RAW    31
-#define M_TCB_SND_MAX_RAW    0x7ffffffULL
-#define V_TCB_SND_MAX_RAW(x) ((x) << S_TCB_SND_MAX_RAW)
-
-#define W_TCB_SND_CWND    6
-#define S_TCB_SND_CWND    26
-#define M_TCB_SND_CWND    0x7ffffffULL
-#define V_TCB_SND_CWND(x) ((x) << S_TCB_SND_CWND)
-
-#define W_TCB_SND_SSTHRESH    7
-#define S_TCB_SND_SSTHRESH    21
-#define M_TCB_SND_SSTHRESH    0x7ffffffULL
-#define V_TCB_SND_SSTHRESH(x) ((x) << S_TCB_SND_SSTHRESH)
-
-#define W_TCB_T_RTT_TS_RECENT_AGE    8
-#define S_TCB_T_RTT_TS_RECENT_AGE    16
-#define M_TCB_T_RTT_TS_RECENT_AGE    0xffffffffULL
-#define V_TCB_T_RTT_TS_RECENT_AGE(x) ((x) << S_TCB_T_RTT_TS_RECENT_AGE)
-
-#define W_TCB_T_RTSEQ_RECENT    9
-#define S_TCB_T_RTSEQ_RECENT    16
-#define M_TCB_T_RTSEQ_RECENT    0xffffffffULL
-#define V_TCB_T_RTSEQ_RECENT(x) ((x) << S_TCB_T_RTSEQ_RECENT)
-
-#define W_TCB_T_SRTT    10
-#define S_TCB_T_SRTT    16
-#define M_TCB_T_SRTT    0xffffULL
-#define V_TCB_T_SRTT(x) ((x) << S_TCB_T_SRTT)
-
-#define W_TCB_T_RTTVAR    11
-#define S_TCB_T_RTTVAR    0
-#define M_TCB_T_RTTVAR    0xffffULL
-#define V_TCB_T_RTTVAR(x) ((x) << S_TCB_T_RTTVAR)
-
-#define W_TCB_TS_LAST_ACK_SENT_RAW    11
-#define S_TCB_TS_LAST_ACK_SENT_RAW    16
-#define M_TCB_TS_LAST_ACK_SENT_RAW    0x7ffffffULL
-#define V_TCB_TS_LAST_ACK_SENT_RAW(x) ((x) << S_TCB_TS_LAST_ACK_SENT_RAW)
-
-#define W_TCB_DIP    12
-#define S_TCB_DIP    11
-#define M_TCB_DIP    0xffffffffULL
-#define V_TCB_DIP(x) ((x) << S_TCB_DIP)
-
-#define W_TCB_SIP    13
-#define S_TCB_SIP    11
-#define M_TCB_SIP    0xffffffffULL
-#define V_TCB_SIP(x) ((x) << S_TCB_SIP)
-
-#define W_TCB_DP    14
-#define S_TCB_DP    11
-#define M_TCB_DP    0xffffULL
-#define V_TCB_DP(x) ((x) << S_TCB_DP)
-
-#define W_TCB_SP    14
-#define S_TCB_SP    27
-#define M_TCB_SP    0xffffULL
-#define V_TCB_SP(x) ((x) << S_TCB_SP)
-
-#define W_TCB_TIMESTAMP    15
-#define S_TCB_TIMESTAMP    11
-#define M_TCB_TIMESTAMP    0xffffffffULL
-#define V_TCB_TIMESTAMP(x) ((x) << S_TCB_TIMESTAMP)
-
-#define W_TCB_TIMESTAMP_OFFSET    16
-#define S_TCB_TIMESTAMP_OFFSET    11
-#define M_TCB_TIMESTAMP_OFFSET    0xfULL
-#define V_TCB_TIMESTAMP_OFFSET(x) ((x) << S_TCB_TIMESTAMP_OFFSET)
-
-#define W_TCB_TX_MAX    16
-#define S_TCB_TX_MAX    15
-#define M_TCB_TX_MAX    0xffffffffULL
-#define V_TCB_TX_MAX(x) ((x) << S_TCB_TX_MAX)
-
-#define W_TCB_TX_HDR_PTR_RAW    17
-#define S_TCB_TX_HDR_PTR_RAW    15
-#define M_TCB_TX_HDR_PTR_RAW    0x1ffffULL
-#define V_TCB_TX_HDR_PTR_RAW(x) ((x) << S_TCB_TX_HDR_PTR_RAW)
-
-#define W_TCB_TX_LAST_PTR_RAW    18
-#define S_TCB_TX_LAST_PTR_RAW    0
-#define M_TCB_TX_LAST_PTR_RAW    0x1ffffULL
-#define V_TCB_TX_LAST_PTR_RAW(x) ((x) << S_TCB_TX_LAST_PTR_RAW)
-
-#define W_TCB_TX_COMPACT    18
-#define S_TCB_TX_COMPACT    17
-#define M_TCB_TX_COMPACT    0x1ULL
-#define V_TCB_TX_COMPACT(x) ((x) << S_TCB_TX_COMPACT)
-
-#define W_TCB_RX_COMPACT    18
-#define S_TCB_RX_COMPACT    18
-#define M_TCB_RX_COMPACT    0x1ULL
-#define V_TCB_RX_COMPACT(x) ((x) << S_TCB_RX_COMPACT)
-
-#define W_TCB_RCV_WND    18
-#define S_TCB_RCV_WND    19
-#define M_TCB_RCV_WND    0x7ffffffULL
-#define V_TCB_RCV_WND(x) ((x) << S_TCB_RCV_WND)
-
-#define W_TCB_RX_HDR_OFFSET    19
-#define S_TCB_RX_HDR_OFFSET    14
-#define M_TCB_RX_HDR_OFFSET    0x7ffffffULL
-#define V_TCB_RX_HDR_OFFSET(x) ((x) << S_TCB_RX_HDR_OFFSET)
-
-#define W_TCB_RX_FRAG0_START_IDX_RAW    20
-#define S_TCB_RX_FRAG0_START_IDX_RAW    9
-#define M_TCB_RX_FRAG0_START_IDX_RAW    0x7ffffffULL
-#define V_TCB_RX_FRAG0_START_IDX_RAW(x) ((x) << S_TCB_RX_FRAG0_START_IDX_RAW)
-
-#define W_TCB_RX_FRAG1_START_IDX_OFFSET    21
-#define S_TCB_RX_FRAG1_START_IDX_OFFSET    4
-#define M_TCB_RX_FRAG1_START_IDX_OFFSET    0x7ffffffULL
-#define V_TCB_RX_FRAG1_START_IDX_OFFSET(x) ((x) << S_TCB_RX_FRAG1_START_IDX_OFFSET)
-
-#define W_TCB_RX_FRAG0_LEN    21
-#define S_TCB_RX_FRAG0_LEN    31
-#define M_TCB_RX_FRAG0_LEN    0x7ffffffULL
-#define V_TCB_RX_FRAG0_LEN(x) ((x) << S_TCB_RX_FRAG0_LEN)
-
-#define W_TCB_RX_FRAG1_LEN    22
-#define S_TCB_RX_FRAG1_LEN    26
-#define M_TCB_RX_FRAG1_LEN    0x7ffffffULL
-#define V_TCB_RX_FRAG1_LEN(x) ((x) << S_TCB_RX_FRAG1_LEN)
-
-#define W_TCB_NEWRENO_RECOVER    23
-#define S_TCB_NEWRENO_RECOVER    21
-#define M_TCB_NEWRENO_RECOVER    0x7ffffffULL
-#define V_TCB_NEWRENO_RECOVER(x) ((x) << S_TCB_NEWRENO_RECOVER)
-
-#define W_TCB_PDU_HAVE_LEN    24
-#define S_TCB_PDU_HAVE_LEN    16
-#define M_TCB_PDU_HAVE_LEN    0x1ULL
-#define V_TCB_PDU_HAVE_LEN(x) ((x) << S_TCB_PDU_HAVE_LEN)
-
-#define W_TCB_PDU_LEN    24
-#define S_TCB_PDU_LEN    17
-#define M_TCB_PDU_LEN    0xffffULL
-#define V_TCB_PDU_LEN(x) ((x) << S_TCB_PDU_LEN)
-
-#define W_TCB_RX_QUIESCE    25
-#define S_TCB_RX_QUIESCE    1
-#define M_TCB_RX_QUIESCE    0x1ULL
-#define V_TCB_RX_QUIESCE(x) ((x) << S_TCB_RX_QUIESCE)
-
-#define W_TCB_RX_PTR_RAW    25
-#define S_TCB_RX_PTR_RAW    2
-#define M_TCB_RX_PTR_RAW    0x1ffffULL
-#define V_TCB_RX_PTR_RAW(x) ((x) << S_TCB_RX_PTR_RAW)
-
-#define W_TCB_CPU_NO    25
-#define S_TCB_CPU_NO    19
-#define M_TCB_CPU_NO    0x7fULL
-#define V_TCB_CPU_NO(x) ((x) << S_TCB_CPU_NO)
-
-#define W_TCB_ULP_TYPE    25
-#define S_TCB_ULP_TYPE    26
-#define M_TCB_ULP_TYPE    0xfULL
-#define V_TCB_ULP_TYPE(x) ((x) << S_TCB_ULP_TYPE)
-
-#define W_TCB_RX_FRAG1_PTR_RAW    25
-#define S_TCB_RX_FRAG1_PTR_RAW    30
-#define M_TCB_RX_FRAG1_PTR_RAW    0x1ffffULL
-#define V_TCB_RX_FRAG1_PTR_RAW(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW)
-
-#define W_TCB_RX_FRAG2_START_IDX_OFFSET_RAW    26
-#define S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW    15
-#define M_TCB_RX_FRAG2_START_IDX_OFFSET_RAW    0x7ffffffULL
-#define V_TCB_RX_FRAG2_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW)
-
-#define W_TCB_RX_FRAG2_PTR_RAW    27
-#define S_TCB_RX_FRAG2_PTR_RAW    10
-#define M_TCB_RX_FRAG2_PTR_RAW    0x1ffffULL
-#define V_TCB_RX_FRAG2_PTR_RAW(x) ((x) << S_TCB_RX_FRAG2_PTR_RAW)
-
-#define W_TCB_RX_FRAG2_LEN_RAW    27
-#define S_TCB_RX_FRAG2_LEN_RAW    27
-#define M_TCB_RX_FRAG2_LEN_RAW    0x7ffffffULL
-#define V_TCB_RX_FRAG2_LEN_RAW(x) ((x) << S_TCB_RX_FRAG2_LEN_RAW)
-
-#define W_TCB_RX_FRAG3_PTR_RAW    28
-#define S_TCB_RX_FRAG3_PTR_RAW    22
-#define M_TCB_RX_FRAG3_PTR_RAW    0x1ffffULL
-#define V_TCB_RX_FRAG3_PTR_RAW(x) ((x) << S_TCB_RX_FRAG3_PTR_RAW)
-
-#define W_TCB_RX_FRAG3_LEN_RAW    29
-#define S_TCB_RX_FRAG3_LEN_RAW    7
-#define M_TCB_RX_FRAG3_LEN_RAW    0x7ffffffULL
-#define V_TCB_RX_FRAG3_LEN_RAW(x) ((x) << S_TCB_RX_FRAG3_LEN_RAW)
-
-#define W_TCB_RX_FRAG3_START_IDX_OFFSET_RAW    30
-#define S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW    2
-#define M_TCB_RX_FRAG3_START_IDX_OFFSET_RAW    0x7ffffffULL
-#define V_TCB_RX_FRAG3_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW)
-
-#define W_TCB_PDU_HDR_LEN    30
-#define S_TCB_PDU_HDR_LEN    29
-#define M_TCB_PDU_HDR_LEN    0xffULL
-#define V_TCB_PDU_HDR_LEN(x) ((x) << S_TCB_PDU_HDR_LEN)
-
-#define W_TCB_SLUSH1    31
-#define S_TCB_SLUSH1    5
-#define M_TCB_SLUSH1    0x7ffffULL
-#define V_TCB_SLUSH1(x) ((x) << S_TCB_SLUSH1)
-
-#define W_TCB_ULP_RAW    31
-#define S_TCB_ULP_RAW    24
-#define M_TCB_ULP_RAW    0xffULL
-#define V_TCB_ULP_RAW(x) ((x) << S_TCB_ULP_RAW)
-
-#define W_TCB_DDP_RDMAP_VERSION    25
-#define S_TCB_DDP_RDMAP_VERSION    30
-#define M_TCB_DDP_RDMAP_VERSION    0x1ULL
-#define V_TCB_DDP_RDMAP_VERSION(x) ((x) << S_TCB_DDP_RDMAP_VERSION)
-
-#define W_TCB_MARKER_ENABLE_RX    25
-#define S_TCB_MARKER_ENABLE_RX    31
-#define M_TCB_MARKER_ENABLE_RX    0x1ULL
-#define V_TCB_MARKER_ENABLE_RX(x) ((x) << S_TCB_MARKER_ENABLE_RX)
-
-#define W_TCB_MARKER_ENABLE_TX    26
-#define S_TCB_MARKER_ENABLE_TX    0
-#define M_TCB_MARKER_ENABLE_TX    0x1ULL
-#define V_TCB_MARKER_ENABLE_TX(x) ((x) << S_TCB_MARKER_ENABLE_TX)
-
-#define W_TCB_CRC_ENABLE    26
-#define S_TCB_CRC_ENABLE    1
-#define M_TCB_CRC_ENABLE    0x1ULL
-#define V_TCB_CRC_ENABLE(x) ((x) << S_TCB_CRC_ENABLE)
-
-#define W_TCB_IRS_ULP    26
-#define S_TCB_IRS_ULP    2
-#define M_TCB_IRS_ULP    0x1ffULL
-#define V_TCB_IRS_ULP(x) ((x) << S_TCB_IRS_ULP)
-
-#define W_TCB_ISS_ULP    26
-#define S_TCB_ISS_ULP    11
-#define M_TCB_ISS_ULP    0x1ffULL
-#define V_TCB_ISS_ULP(x) ((x) << S_TCB_ISS_ULP)
-
-#define W_TCB_TX_PDU_LEN    26
-#define S_TCB_TX_PDU_LEN    20
-#define M_TCB_TX_PDU_LEN    0x3fffULL
-#define V_TCB_TX_PDU_LEN(x) ((x) << S_TCB_TX_PDU_LEN)
-
-#define W_TCB_TX_PDU_OUT    27
-#define S_TCB_TX_PDU_OUT    2
-#define M_TCB_TX_PDU_OUT    0x1ULL
-#define V_TCB_TX_PDU_OUT(x) ((x) << S_TCB_TX_PDU_OUT)
-
-#define W_TCB_CQ_IDX_SQ    27
-#define S_TCB_CQ_IDX_SQ    3
-#define M_TCB_CQ_IDX_SQ    0xffffULL
-#define V_TCB_CQ_IDX_SQ(x) ((x) << S_TCB_CQ_IDX_SQ)
-
-#define W_TCB_CQ_IDX_RQ    27
-#define S_TCB_CQ_IDX_RQ    19
-#define M_TCB_CQ_IDX_RQ    0xffffULL
-#define V_TCB_CQ_IDX_RQ(x) ((x) << S_TCB_CQ_IDX_RQ)
-
-#define W_TCB_QP_ID    28
-#define S_TCB_QP_ID    3
-#define M_TCB_QP_ID    0xffffULL
-#define V_TCB_QP_ID(x) ((x) << S_TCB_QP_ID)
-
-#define W_TCB_PD_ID    28
-#define S_TCB_PD_ID    19
-#define M_TCB_PD_ID    0xffffULL
-#define V_TCB_PD_ID(x) ((x) << S_TCB_PD_ID)
-
-#define W_TCB_STAG    29
-#define S_TCB_STAG    3
-#define M_TCB_STAG    0xffffffffULL
-#define V_TCB_STAG(x) ((x) << S_TCB_STAG)
-
-#define W_TCB_RQ_START    30
-#define S_TCB_RQ_START    3
-#define M_TCB_RQ_START    0x3ffffffULL
-#define V_TCB_RQ_START(x) ((x) << S_TCB_RQ_START)
-
-#define W_TCB_RQ_MSN    30
-#define S_TCB_RQ_MSN    29
-#define M_TCB_RQ_MSN    0x3ffULL
-#define V_TCB_RQ_MSN(x) ((x) << S_TCB_RQ_MSN)
-
-#define W_TCB_RQ_MAX_OFFSET    31
-#define S_TCB_RQ_MAX_OFFSET    7
-#define M_TCB_RQ_MAX_OFFSET    0xfULL
-#define V_TCB_RQ_MAX_OFFSET(x) ((x) << S_TCB_RQ_MAX_OFFSET)
-
-#define W_TCB_RQ_WRITE_PTR    31
-#define S_TCB_RQ_WRITE_PTR    11
-#define M_TCB_RQ_WRITE_PTR    0x3ffULL
-#define V_TCB_RQ_WRITE_PTR(x) ((x) << S_TCB_RQ_WRITE_PTR)
-
-#define W_TCB_INB_WRITE_PERM    31
-#define S_TCB_INB_WRITE_PERM    21
-#define M_TCB_INB_WRITE_PERM    0x1ULL
-#define V_TCB_INB_WRITE_PERM(x) ((x) << S_TCB_INB_WRITE_PERM)
-
-#define W_TCB_INB_READ_PERM    31
-#define S_TCB_INB_READ_PERM    22
-#define M_TCB_INB_READ_PERM    0x1ULL
-#define V_TCB_INB_READ_PERM(x) ((x) << S_TCB_INB_READ_PERM)
-
-#define W_TCB_ORD_L_BIT_VLD    31
-#define S_TCB_ORD_L_BIT_VLD    23
-#define M_TCB_ORD_L_BIT_VLD    0x1ULL
-#define V_TCB_ORD_L_BIT_VLD(x) ((x) << S_TCB_ORD_L_BIT_VLD)
-
-#define W_TCB_RDMAP_OPCODE    31
-#define S_TCB_RDMAP_OPCODE    24
-#define M_TCB_RDMAP_OPCODE    0xfULL
-#define V_TCB_RDMAP_OPCODE(x) ((x) << S_TCB_RDMAP_OPCODE)
-
-#define W_TCB_TX_FLUSH    31
-#define S_TCB_TX_FLUSH    28
-#define M_TCB_TX_FLUSH    0x1ULL
-#define V_TCB_TX_FLUSH(x) ((x) << S_TCB_TX_FLUSH)
-
-#define W_TCB_TX_OOS_RXMT    31
-#define S_TCB_TX_OOS_RXMT    29
-#define M_TCB_TX_OOS_RXMT    0x1ULL
-#define V_TCB_TX_OOS_RXMT(x) ((x) << S_TCB_TX_OOS_RXMT)
-
-#define W_TCB_TX_OOS_TXMT    31
-#define S_TCB_TX_OOS_TXMT    30
-#define M_TCB_TX_OOS_TXMT    0x1ULL
-#define V_TCB_TX_OOS_TXMT(x) ((x) << S_TCB_TX_OOS_TXMT)
-
-#define W_TCB_SLUSH_AUX2    31
-#define S_TCB_SLUSH_AUX2    31
-#define M_TCB_SLUSH_AUX2    0x1ULL
-#define V_TCB_SLUSH_AUX2(x) ((x) << S_TCB_SLUSH_AUX2)
-
-#define W_TCB_RX_FRAG1_PTR_RAW2    25
-#define S_TCB_RX_FRAG1_PTR_RAW2    30
-#define M_TCB_RX_FRAG1_PTR_RAW2    0x1ffffULL
-#define V_TCB_RX_FRAG1_PTR_RAW2(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW2)
-
-#define W_TCB_RX_DDP_FLAGS    26
-#define S_TCB_RX_DDP_FLAGS    15
-#define M_TCB_RX_DDP_FLAGS    0x3ffULL
-#define V_TCB_RX_DDP_FLAGS(x) ((x) << S_TCB_RX_DDP_FLAGS)
-
-#define W_TCB_SLUSH_AUX3    26
-#define S_TCB_SLUSH_AUX3    31
-#define M_TCB_SLUSH_AUX3    0x1ffULL
-#define V_TCB_SLUSH_AUX3(x) ((x) << S_TCB_SLUSH_AUX3)
-
-#define W_TCB_RX_DDP_BUF0_OFFSET    27
-#define S_TCB_RX_DDP_BUF0_OFFSET    8
-#define M_TCB_RX_DDP_BUF0_OFFSET    0x3fffffULL
-#define V_TCB_RX_DDP_BUF0_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF0_OFFSET)
-
-#define W_TCB_RX_DDP_BUF0_LEN    27
-#define S_TCB_RX_DDP_BUF0_LEN    30
-#define M_TCB_RX_DDP_BUF0_LEN    0x3fffffULL
-#define V_TCB_RX_DDP_BUF0_LEN(x) ((x) << S_TCB_RX_DDP_BUF0_LEN)
-
-#define W_TCB_RX_DDP_BUF1_OFFSET    28
-#define S_TCB_RX_DDP_BUF1_OFFSET    20
-#define M_TCB_RX_DDP_BUF1_OFFSET    0x3fffffULL
-#define V_TCB_RX_DDP_BUF1_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF1_OFFSET)
-
-#define W_TCB_RX_DDP_BUF1_LEN    29
-#define S_TCB_RX_DDP_BUF1_LEN    10
-#define M_TCB_RX_DDP_BUF1_LEN    0x3fffffULL
-#define V_TCB_RX_DDP_BUF1_LEN(x) ((x) << S_TCB_RX_DDP_BUF1_LEN)
-
-#define W_TCB_RX_DDP_BUF0_TAG    30
-#define S_TCB_RX_DDP_BUF0_TAG    0
-#define M_TCB_RX_DDP_BUF0_TAG    0xffffffffULL
-#define V_TCB_RX_DDP_BUF0_TAG(x) ((x) << S_TCB_RX_DDP_BUF0_TAG)
-
-#define W_TCB_RX_DDP_BUF1_TAG    31
-#define S_TCB_RX_DDP_BUF1_TAG    0
-#define M_TCB_RX_DDP_BUF1_TAG    0xffffffffULL
-#define V_TCB_RX_DDP_BUF1_TAG(x) ((x) << S_TCB_RX_DDP_BUF1_TAG)
-
-#define S_TF_DACK    10
-#define V_TF_DACK(x) ((x) << S_TF_DACK)
-
-#define S_TF_NAGLE    11
-#define V_TF_NAGLE(x) ((x) << S_TF_NAGLE)
-
-#define S_TF_RECV_SCALE    12
-#define V_TF_RECV_SCALE(x) ((x) << S_TF_RECV_SCALE)
-
-#define S_TF_RECV_TSTMP    13
-#define V_TF_RECV_TSTMP(x) ((x) << S_TF_RECV_TSTMP)
-
-#define S_TF_RECV_SACK    14
-#define V_TF_RECV_SACK(x) ((x) << S_TF_RECV_SACK)
-
-#define S_TF_TURBO    15
-#define V_TF_TURBO(x) ((x) << S_TF_TURBO)
-
-#define S_TF_KEEPALIVE    16
-#define V_TF_KEEPALIVE(x) ((x) << S_TF_KEEPALIVE)
-
-#define S_TF_TCAM_BYPASS    17
-#define V_TF_TCAM_BYPASS(x) ((x) << S_TF_TCAM_BYPASS)
-
-#define S_TF_CORE_FIN    18
-#define V_TF_CORE_FIN(x) ((x) << S_TF_CORE_FIN)
-
-#define S_TF_CORE_MORE    19
-#define V_TF_CORE_MORE(x) ((x) << S_TF_CORE_MORE)
-
-#define S_TF_MIGRATING    20
-#define V_TF_MIGRATING(x) ((x) << S_TF_MIGRATING)
-
-#define S_TF_ACTIVE_OPEN    21
-#define V_TF_ACTIVE_OPEN(x) ((x) << S_TF_ACTIVE_OPEN)
-
-#define S_TF_ASK_MODE    22
-#define V_TF_ASK_MODE(x) ((x) << S_TF_ASK_MODE)
-
-#define S_TF_NON_OFFLOAD    23
-#define V_TF_NON_OFFLOAD(x) ((x) << S_TF_NON_OFFLOAD)
-
-#define S_TF_MOD_SCHD    24
-#define V_TF_MOD_SCHD(x) ((x) << S_TF_MOD_SCHD)
-
-#define S_TF_MOD_SCHD_REASON0    25
-#define V_TF_MOD_SCHD_REASON0(x) ((x) << S_TF_MOD_SCHD_REASON0)
-
-#define S_TF_MOD_SCHD_REASON1    26
-#define V_TF_MOD_SCHD_REASON1(x) ((x) << S_TF_MOD_SCHD_REASON1)
-
-#define S_TF_MOD_SCHD_RX    27
-#define V_TF_MOD_SCHD_RX(x) ((x) << S_TF_MOD_SCHD_RX)
-
-#define S_TF_CORE_PUSH    28
-#define V_TF_CORE_PUSH(x) ((x) << S_TF_CORE_PUSH)
-
-#define S_TF_RCV_COALESCE_ENABLE    29
-#define V_TF_RCV_COALESCE_ENABLE(x) ((x) << S_TF_RCV_COALESCE_ENABLE)
-
-#define S_TF_RCV_COALESCE_PUSH    30
-#define V_TF_RCV_COALESCE_PUSH(x) ((x) << S_TF_RCV_COALESCE_PUSH)
-
-#define S_TF_RCV_COALESCE_LAST_PSH    31
-#define V_TF_RCV_COALESCE_LAST_PSH(x) ((x) << S_TF_RCV_COALESCE_LAST_PSH)
-
-#define S_TF_RCV_COALESCE_HEARTBEAT    32
-#define V_TF_RCV_COALESCE_HEARTBEAT(x) ((x) << S_TF_RCV_COALESCE_HEARTBEAT)
-
-#define S_TF_HALF_CLOSE    33
-#define V_TF_HALF_CLOSE(x) ((x) << S_TF_HALF_CLOSE)
-
-#define S_TF_DACK_MSS    34
-#define V_TF_DACK_MSS(x) ((x) << S_TF_DACK_MSS)
-
-#define S_TF_CCTRL_SEL0    35
-#define V_TF_CCTRL_SEL0(x) ((x) << S_TF_CCTRL_SEL0)
-
-#define S_TF_CCTRL_SEL1    36
-#define V_TF_CCTRL_SEL1(x) ((x) << S_TF_CCTRL_SEL1)
-
-#define S_TF_TCP_NEWRENO_FAST_RECOVERY    37
-#define V_TF_TCP_NEWRENO_FAST_RECOVERY(x) ((x) << S_TF_TCP_NEWRENO_FAST_RECOVERY)
-
-#define S_TF_TX_PACE_AUTO    38
-#define V_TF_TX_PACE_AUTO(x) ((x) << S_TF_TX_PACE_AUTO)
-
-#define S_TF_PEER_FIN_HELD    39
-#define V_TF_PEER_FIN_HELD(x) ((x) << S_TF_PEER_FIN_HELD)
-
-#define S_TF_CORE_URG    40
-#define V_TF_CORE_URG(x) ((x) << S_TF_CORE_URG)
-
-#define S_TF_RDMA_ERROR    41
-#define V_TF_RDMA_ERROR(x) ((x) << S_TF_RDMA_ERROR)
-
-#define S_TF_SSWS_DISABLED    42
-#define V_TF_SSWS_DISABLED(x) ((x) << S_TF_SSWS_DISABLED)
-
-#define S_TF_DUPACK_COUNT_ODD    43
-#define V_TF_DUPACK_COUNT_ODD(x) ((x) << S_TF_DUPACK_COUNT_ODD)
-
-#define S_TF_TX_CHANNEL    44
-#define V_TF_TX_CHANNEL(x) ((x) << S_TF_TX_CHANNEL)
-
-#define S_TF_RX_CHANNEL    45
-#define V_TF_RX_CHANNEL(x) ((x) << S_TF_RX_CHANNEL)
-
-#define S_TF_TX_PACE_FIXED    46
-#define V_TF_TX_PACE_FIXED(x) ((x) << S_TF_TX_PACE_FIXED)
-
-#define S_TF_RDMA_FLM_ERROR    47
-#define V_TF_RDMA_FLM_ERROR(x) ((x) << S_TF_RDMA_FLM_ERROR)
-
-#define S_TF_RX_FLOW_CONTROL_DISABLE    48
-#define V_TF_RX_FLOW_CONTROL_DISABLE(x) ((x) << S_TF_RX_FLOW_CONTROL_DISABLE)
-
-#endif /* _TCB_DEFS_H */
diff --git a/drivers/infiniband/hw/cxgb4/Kconfig b/drivers/infiniband/hw/cxgb4/Kconfig
index b49e8d4..9e2b2c3 100644
--- a/drivers/infiniband/hw/cxgb4/Kconfig
+++ b/drivers/infiniband/hw/cxgb4/Kconfig
@@ -5,7 +5,7 @@
 	depends on INFINIBAND_ADDR_TRANS
 	select CHELSIO_LIB
 	select GENERIC_ALLOCATOR
-	---help---
+	help
 	  This is an iWARP/RDMA driver for the Chelsio T4 and T5
 	  1GbE, 10GbE adapters and T5 40GbE adapter.
 
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 535ee41..e42c812 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -77,9 +77,9 @@
 module_param(enable_ecn, int, 0644);
 MODULE_PARM_DESC(enable_ecn, "Enable ECN (default=0/disabled)");
 
-static int dack_mode = 1;
+static int dack_mode;
 module_param(dack_mode, int, 0644);
-MODULE_PARM_DESC(dack_mode, "Delayed ack mode (default=1)");
+MODULE_PARM_DESC(dack_mode, "Delayed ack mode (default=0)");
 
 uint c4iw_max_read_depth = 32;
 module_param(c4iw_max_read_depth, int, 0644);
@@ -2885,7 +2885,7 @@
 	case MORIBUND:
 	case CLOSING:
 		stop_ep_timer(ep);
-		/*FALLTHROUGH*/
+		fallthrough;
 	case FPDU_MODE:
 		if (ep->com.qp && ep->com.qp->srq) {
 			srqidx = ABORT_RSS_SRQIDX_G(
@@ -3282,7 +3282,7 @@
 
 static int pick_local_ip6addrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id)
 {
-	struct in6_addr uninitialized_var(addr);
+	struct in6_addr addr;
 	struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->m_local_addr;
 	struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->m_remote_addr;
 
@@ -3760,7 +3760,7 @@
 			send_fw_act_open_req(ep, atid);
 			return;
 		}
-		/* fall through */
+		fallthrough;
 	case FW_EADDRINUSE:
 		set_bit(ACT_RETRY_INUSE, &ep->com.history);
 		if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) {
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
index 16b7459..d6cfefc 100644
--- a/drivers/infiniband/hw/cxgb4/cq.c
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -754,7 +754,7 @@
 static int __c4iw_poll_cq_one(struct c4iw_cq *chp, struct c4iw_qp *qhp,
 			      struct ib_wc *wc, struct c4iw_srq *srq)
 {
-	struct t4_cqe uninitialized_var(cqe);
+	struct t4_cqe cqe;
 	struct t4_wq *wq = qhp ? &qhp->wq : NULL;
 	u32 credit = 0;
 	u8 cqe_flushed;
@@ -967,7 +967,7 @@
 	return !err || err == -ENODATA ? npolled : err;
 }
 
-void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
 	struct c4iw_cq *chp;
 	struct c4iw_ucontext *ucontext;
@@ -985,6 +985,7 @@
 		   ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx,
 		   chp->destroy_skb, chp->wr_waitp);
 	c4iw_put_wr_wait(chp->wr_waitp);
+	return 0;
 }
 
 int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 7d06b0f..a278994 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -707,7 +707,7 @@
 	u8 flags;
 	u8 revision;
 	__be16 private_data_size;
-	u8 private_data[0];
+	u8 private_data[];
 };
 
 struct mpa_v2_conn_params {
@@ -719,7 +719,7 @@
 	u8 layer_etype;
 	u8 ecode;
 	__be16 hdrct_rsvd;
-	u8 len_hdrs[0];
+	u8 len_hdrs[];
 };
 
 #define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28)
@@ -980,26 +980,25 @@
 void c4iw_qp_add_ref(struct ib_qp *qp);
 void c4iw_qp_rem_ref(struct ib_qp *qp);
 struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			    u32 max_num_sg, struct ib_udata *udata);
+			    u32 max_num_sg);
 int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		   unsigned int *sg_offset);
 int c4iw_dealloc_mw(struct ib_mw *mw);
 void c4iw_dealloc(struct uld_ctx *ctx);
-struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
-			    struct ib_udata *udata);
+int c4iw_alloc_mw(struct ib_mw *mw, struct ib_udata *udata);
 struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start,
 					   u64 length, u64 virt, int acc,
 					   struct ib_udata *udata);
 struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc);
 int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
-void c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
+int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
 int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		   struct ib_udata *udata);
 int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 int c4iw_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *attr,
 		    enum ib_srq_attr_mask srq_attr_mask,
 		    struct ib_udata *udata);
-void c4iw_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata);
+int c4iw_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata);
 int c4iw_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *attrs,
 		    struct ib_udata *udata);
 int c4iw_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata);
@@ -1053,8 +1052,9 @@
 		       const struct ib_recv_wr **bad_wr);
 struct c4iw_wr_wait *c4iw_alloc_wr_wait(gfp_t gfp);
 
-typedef int c4iw_restrack_func(struct sk_buff *msg,
-			       struct rdma_restrack_entry *res);
-extern c4iw_restrack_func *c4iw_restrack_funcs[RDMA_RESTRACK_MAX];
+int c4iw_fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr);
+int c4iw_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ibcq);
+int c4iw_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ibqp);
+int c4iw_fill_res_cm_id_entry(struct sk_buff *msg, struct rdma_cm_id *cm_id);
 
 #endif
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index dcb58ce..42234df 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -510,7 +510,7 @@
 	__be64 *pages;
 	int shift, n, i;
 	int err = -ENOMEM;
-	struct sg_dma_page_iter sg_iter;
+	struct ib_block_iter biter;
 	struct c4iw_dev *rhp;
 	struct c4iw_pd *php;
 	struct c4iw_mr *mhp;
@@ -542,13 +542,13 @@
 
 	mhp->rhp = rhp;
 
-	mhp->umem = ib_umem_get(udata, start, length, acc, 0);
+	mhp->umem = ib_umem_get(pd->device, start, length, acc);
 	if (IS_ERR(mhp->umem))
 		goto err_free_skb;
 
 	shift = PAGE_SHIFT;
 
-	n = ib_umem_num_pages(mhp->umem);
+	n = ib_umem_num_dma_blocks(mhp->umem, 1 << shift);
 	err = alloc_pbl(mhp, n);
 	if (err)
 		goto err_umem_release;
@@ -561,8 +561,8 @@
 
 	i = n = 0;
 
-	for_each_sg_dma_page(mhp->umem->sg_head.sgl, &sg_iter, mhp->umem->nmap, 0) {
-		pages[i++] = cpu_to_be64(sg_page_iter_dma_address(&sg_iter));
+	rdma_umem_for_each_dma_block(mhp->umem, &biter, 1 << shift) {
+		pages[i++] = cpu_to_be64(rdma_block_iter_dma_address(&biter));
 		if (i == PAGE_SIZE / sizeof(*pages)) {
 			err = write_pbl(&mhp->rhp->rdev, pages,
 					mhp->attr.pbl_addr + (n << 3), i,
@@ -611,30 +611,23 @@
 	return ERR_PTR(err);
 }
 
-struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
-			    struct ib_udata *udata)
+int c4iw_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
 {
+	struct c4iw_mw *mhp = to_c4iw_mw(ibmw);
 	struct c4iw_dev *rhp;
 	struct c4iw_pd *php;
-	struct c4iw_mw *mhp;
 	u32 mmid;
 	u32 stag = 0;
 	int ret;
 
-	if (type != IB_MW_TYPE_1)
-		return ERR_PTR(-EINVAL);
+	if (ibmw->type != IB_MW_TYPE_1)
+		return -EINVAL;
 
-	php = to_c4iw_pd(pd);
+	php = to_c4iw_pd(ibmw->pd);
 	rhp = php->rhp;
-	mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
-	if (!mhp)
-		return ERR_PTR(-ENOMEM);
-
 	mhp->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL);
-	if (!mhp->wr_waitp) {
-		ret = -ENOMEM;
-		goto free_mhp;
-	}
+	if (!mhp->wr_waitp)
+		return -ENOMEM;
 
 	mhp->dereg_skb = alloc_skb(SGE_MAX_WR_LEN, GFP_KERNEL);
 	if (!mhp->dereg_skb) {
@@ -645,18 +638,19 @@
 	ret = allocate_window(&rhp->rdev, &stag, php->pdid, mhp->wr_waitp);
 	if (ret)
 		goto free_skb;
+
 	mhp->rhp = rhp;
 	mhp->attr.pdid = php->pdid;
 	mhp->attr.type = FW_RI_STAG_MW;
 	mhp->attr.stag = stag;
 	mmid = (stag) >> 8;
-	mhp->ibmw.rkey = stag;
+	ibmw->rkey = stag;
 	if (xa_insert_irq(&rhp->mrs, mmid, mhp, GFP_KERNEL)) {
 		ret = -ENOMEM;
 		goto dealloc_win;
 	}
 	pr_debug("mmid 0x%x mhp %p stag 0x%x\n", mmid, mhp, stag);
-	return &(mhp->ibmw);
+	return 0;
 
 dealloc_win:
 	deallocate_window(&rhp->rdev, mhp->attr.stag, mhp->dereg_skb,
@@ -665,9 +659,7 @@
 	kfree_skb(mhp->dereg_skb);
 free_wr_wait:
 	c4iw_put_wr_wait(mhp->wr_waitp);
-free_mhp:
-	kfree(mhp);
-	return ERR_PTR(ret);
+	return ret;
 }
 
 int c4iw_dealloc_mw(struct ib_mw *mw)
@@ -684,13 +676,11 @@
 			  mhp->wr_waitp);
 	kfree_skb(mhp->dereg_skb);
 	c4iw_put_wr_wait(mhp->wr_waitp);
-	pr_debug("ib_mw %p mmid 0x%x ptr %p\n", mw, mmid, mhp);
-	kfree(mhp);
 	return 0;
 }
 
 struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			    u32 max_num_sg, struct ib_udata *udata)
+			    u32 max_num_sg)
 {
 	struct c4iw_dev *rhp;
 	struct c4iw_pd *php;
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index d373ac0..8138c57 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -190,7 +190,7 @@
 	return ret;
 }
 
-static void c4iw_deallocate_pd(struct ib_pd *pd, struct ib_udata *udata)
+static int c4iw_deallocate_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
 	struct c4iw_dev *rhp;
 	struct c4iw_pd *php;
@@ -202,6 +202,7 @@
 	mutex_lock(&rhp->rdev.stats.lock);
 	rhp->rdev.stats.pd.cur--;
 	mutex_unlock(&rhp->rdev.stats.lock);
+	return 0;
 }
 
 static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_udata *udata)
@@ -236,14 +237,6 @@
 	return 0;
 }
 
-static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
-			   u16 *pkey)
-{
-	pr_debug("ibdev %p\n", ibdev);
-	*pkey = 0;
-	return 0;
-}
-
 static int c4iw_query_gid(struct ib_device *ibdev, u8 port, int index,
 			  union ib_gid *gid)
 {
@@ -305,7 +298,10 @@
 static int c4iw_query_port(struct ib_device *ibdev, u8 port,
 			   struct ib_port_attr *props)
 {
+	int ret = 0;
 	pr_debug("ibdev %p\n", ibdev);
+	ret = ib_get_eth_speed(ibdev, port, &props->active_speed,
+			       &props->active_width);
 
 	props->port_cap_flags =
 	    IB_PORT_CM_SUP |
@@ -314,12 +310,9 @@
 	    IB_PORT_DEVICE_MGMT_SUP |
 	    IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
 	props->gid_tbl_len = 1;
-	props->pkey_tbl_len = 1;
-	props->active_width = 2;
-	props->active_speed = IB_SPEED_DDR;
 	props->max_msg_sz = -1;
 
-	return 0;
+	return ret;
 }
 
 static ssize_t hw_rev_show(struct device *dev,
@@ -438,7 +431,6 @@
 	if (err)
 		return err;
 
-	immutable->pkey_tbl_len = attr.pkey_tbl_len;
 	immutable->gid_tbl_len = attr.gid_tbl_len;
 
 	return 0;
@@ -457,13 +449,6 @@
 		 FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers));
 }
 
-static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res)
-{
-	return (res->type < ARRAY_SIZE(c4iw_restrack_funcs) &&
-		c4iw_restrack_funcs[res->type]) ?
-		c4iw_restrack_funcs[res->type](msg, res) : 0;
-}
-
 static const struct ib_device_ops c4iw_dev_ops = {
 	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_CXGB4,
@@ -484,7 +469,9 @@
 	.destroy_cq = c4iw_destroy_cq,
 	.destroy_qp = c4iw_destroy_qp,
 	.destroy_srq = c4iw_destroy_srq,
-	.fill_res_entry = fill_res_entry,
+	.fill_res_cq_entry = c4iw_fill_res_cq_entry,
+	.fill_res_cm_id_entry = c4iw_fill_res_cm_id_entry,
+	.fill_res_mr_entry = c4iw_fill_res_mr_entry,
 	.get_dev_fw_str = get_dev_fw_str,
 	.get_dma_mr = c4iw_get_dma_mr,
 	.get_hw_stats = c4iw_get_mib,
@@ -507,13 +494,14 @@
 	.post_srq_recv = c4iw_post_srq_recv,
 	.query_device = c4iw_query_device,
 	.query_gid = c4iw_query_gid,
-	.query_pkey = c4iw_query_pkey,
 	.query_port = c4iw_query_port,
 	.query_qp = c4iw_ib_query_qp,
 	.reg_user_mr = c4iw_reg_user_mr,
 	.req_notify_cq = c4iw_arm_cq,
-	INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd),
+
 	INIT_RDMA_OBJ_SIZE(ib_cq, c4iw_cq, ibcq),
+	INIT_RDMA_OBJ_SIZE(ib_mw, c4iw_mw, ibmw),
+	INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd),
 	INIT_RDMA_OBJ_SIZE(ib_srq, c4iw_srq, ibsrq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, c4iw_ucontext, ibucontext),
 };
@@ -582,7 +570,9 @@
 	ret = set_netdevs(&dev->ibdev, &dev->rdev);
 	if (ret)
 		goto err_dealloc_ctx;
-	ret = ib_register_device(&dev->ibdev, "cxgb4_%d");
+	dma_set_max_seg_size(&dev->rdev.lldi.pdev->dev, UINT_MAX);
+	ret = ib_register_device(&dev->ibdev, "cxgb4_%d",
+				 &dev->rdev.lldi.pdev->dev);
 	if (ret)
 		goto err_dealloc_ctx;
 	return;
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 3ac08f4..12e5461 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -1166,7 +1166,7 @@
 				break;
 			}
 			fw_flags |= FW_RI_RDMA_WRITE_WITH_IMMEDIATE;
-			/*FALLTHROUGH*/
+			fallthrough;
 		case IB_WR_RDMA_WRITE:
 			fw_opcode = FW_RI_RDMA_WRITE_WR;
 			swsqe->opcode = FW_RI_RDMA_WRITE;
@@ -2128,7 +2128,7 @@
 	pr_debug("ib_pd %p\n", pd);
 
 	if (attrs->qp_type != IB_QPT_RC)
-		return ERR_PTR(-EINVAL);
+		return ERR_PTR(-EOPNOTSUPP);
 
 	php = to_c4iw_pd(pd);
 	rhp = php->rhp;
@@ -2469,6 +2469,7 @@
 	memset(attr, 0, sizeof(*attr));
 	memset(init_attr, 0, sizeof(*init_attr));
 	attr->qp_state = to_ib_qp_state(qhp->attr.state);
+	attr->cur_qp_state = to_ib_qp_state(qhp->attr.state);
 	init_attr->cap.max_send_wr = qhp->attr.sq_num_entries;
 	init_attr->cap.max_recv_wr = qhp->attr.rq_num_entries;
 	init_attr->cap.max_send_sge = qhp->attr.sq_max_sges;
@@ -2798,7 +2799,7 @@
 	return ret;
 }
 
-void c4iw_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
+int c4iw_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
 	struct c4iw_dev *rhp;
 	struct c4iw_srq *srq;
@@ -2814,4 +2815,5 @@
 		       srq->wr_waitp);
 	c4iw_free_srq_idx(&rhp->rdev, srq->idx);
 	c4iw_put_wr_wait(srq->wr_waitp);
+	return 0;
 }
diff --git a/drivers/infiniband/hw/cxgb4/restrack.c b/drivers/infiniband/hw/cxgb4/restrack.c
index f82d46e..b32e651 100644
--- a/drivers/infiniband/hw/cxgb4/restrack.c
+++ b/drivers/infiniband/hw/cxgb4/restrack.c
@@ -134,10 +134,8 @@
 	return -EMSGSIZE;
 }
 
-static int fill_res_qp_entry(struct sk_buff *msg,
-			     struct rdma_restrack_entry *res)
+int c4iw_fill_res_qp_entry(struct sk_buff *msg, struct ib_qp *ibqp)
 {
-	struct ib_qp *ibqp = container_of(res, struct ib_qp, res);
 	struct t4_swsqe *fsp = NULL, *lsp = NULL;
 	struct c4iw_qp *qhp = to_c4iw_qp(ibqp);
 	u16 first_sq_idx = 0, last_sq_idx = 0;
@@ -195,10 +193,9 @@
 	struct c4iw_ep ep;
 };
 
-static int fill_res_ep_entry(struct sk_buff *msg,
-			     struct rdma_restrack_entry *res)
+int c4iw_fill_res_cm_id_entry(struct sk_buff *msg,
+			      struct rdma_cm_id *cm_id)
 {
-	struct rdma_cm_id *cm_id = rdma_res_to_id(res);
 	struct nlattr *table_attr;
 	struct c4iw_ep_common *epcp;
 	struct c4iw_listen_ep *listen_ep = NULL;
@@ -372,10 +369,8 @@
 	return -EMSGSIZE;
 }
 
-static int fill_res_cq_entry(struct sk_buff *msg,
-			     struct rdma_restrack_entry *res)
+int c4iw_fill_res_cq_entry(struct sk_buff *msg, struct ib_cq *ibcq)
 {
-	struct ib_cq *ibcq = container_of(res, struct ib_cq, res);
 	struct c4iw_cq *chp = to_c4iw_cq(ibcq);
 	struct nlattr *table_attr;
 	struct t4_cqe hwcqes[2];
@@ -433,10 +428,8 @@
 	return -EMSGSIZE;
 }
 
-static int fill_res_mr_entry(struct sk_buff *msg,
-			     struct rdma_restrack_entry *res)
+int c4iw_fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr)
 {
-	struct ib_mr *ibmr = container_of(res, struct ib_mr, res);
 	struct c4iw_mr *mhp = to_c4iw_mr(ibmr);
 	struct c4iw_dev *dev = mhp->rhp;
 	u32 stag = mhp->attr.stag;
@@ -492,10 +485,3 @@
 err:
 	return -EMSGSIZE;
 }
-
-c4iw_restrack_func *c4iw_restrack_funcs[RDMA_RESTRACK_MAX] = {
-	[RDMA_RESTRACK_QP]	= fill_res_qp_entry,
-	[RDMA_RESTRACK_CM_ID]	= fill_res_ep_entry,
-	[RDMA_RESTRACK_CQ]	= fill_res_cq_entry,
-	[RDMA_RESTRACK_MR]	= fill_res_mr_entry,
-};
diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h
index cbdb300..a2f5e29 100644
--- a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h
+++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h
@@ -123,7 +123,7 @@
 	__be32	len0;
 	__be64	addr0;
 #ifndef C99_NOT_SUPPORTED
-	struct fw_ri_dsge_pair sge[0];
+	struct fw_ri_dsge_pair sge[];
 #endif
 };
 
@@ -139,7 +139,7 @@
 	__be16	nsge;
 	__be32	r2;
 #ifndef C99_NOT_SUPPORTED
-	struct fw_ri_sge sge[0];
+	struct fw_ri_sge sge[];
 #endif
 };
 
@@ -149,7 +149,7 @@
 	__be16	r2;
 	__be32	immdlen;
 #ifndef C99_NOT_SUPPORTED
-	__u8	data[0];
+	__u8	data[];
 #endif
 };
 
@@ -321,7 +321,7 @@
 	__be32 len16_pkd;
 	__u64  cookie;
 #ifndef C99_NOT_SUPPORTED
-	struct fw_ri_res res[0];
+	struct fw_ri_res res[];
 #endif
 };
 
diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
index 2283e43..e5d9712 100644
--- a/drivers/infiniband/hw/efa/efa.h
+++ b/drivers/infiniband/hw/efa/efa.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_H_
@@ -33,18 +33,15 @@
 	char name[EFA_IRQNAME_SIZE];
 };
 
-struct efa_sw_stats {
+/* Don't use anything other than atomic64 */
+struct efa_stats {
 	atomic64_t alloc_pd_err;
 	atomic64_t create_qp_err;
 	atomic64_t create_cq_err;
 	atomic64_t reg_mr_err;
 	atomic64_t alloc_ucontext_err;
 	atomic64_t create_ah_err;
-};
-
-/* Don't use anything other than atomic64 */
-struct efa_stats {
-	struct efa_sw_stats sw_stats;
+	atomic64_t mmap_err;
 	atomic64_t keep_alive_rcvd;
 };
 
@@ -60,8 +57,6 @@
 	u64 mem_bar_len;
 	u64 db_bar_addr;
 	u64 db_bar_len;
-	u8 addr[EFA_GID_SIZE];
-	u32 mtu;
 
 	int admin_msix_vector_idx;
 	struct efa_irq admin_irq;
@@ -71,8 +66,6 @@
 
 struct efa_ucontext {
 	struct ib_ucontext ibucontext;
-	struct xarray mmap_xa;
-	u32 mmap_xa_page;
 	u16 uarn;
 };
 
@@ -91,6 +84,7 @@
 	struct efa_ucontext *ucontext;
 	dma_addr_t dma_addr;
 	void *cpu_addr;
+	struct rdma_user_mmap_entry *mmap_entry;
 	size_t size;
 	u16 cq_idx;
 };
@@ -101,6 +95,13 @@
 	void *rq_cpu_addr;
 	size_t rq_size;
 	enum ib_qp_state state;
+
+	/* Used for saving mmap_xa entries */
+	struct rdma_user_mmap_entry *sq_db_mmap_entry;
+	struct rdma_user_mmap_entry *llq_desc_mmap_entry;
+	struct rdma_user_mmap_entry *rq_db_mmap_entry;
+	struct rdma_user_mmap_entry *rq_mmap_entry;
+
 	u32 qp_handle;
 	u32 max_send_wr;
 	u32 max_recv_wr;
@@ -129,12 +130,12 @@
 int efa_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
 		   u16 *pkey);
 int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
-void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
+int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
 int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
 struct ib_qp *efa_create_qp(struct ib_pd *ibpd,
 			    struct ib_qp_init_attr *init_attr,
 			    struct ib_udata *udata);
-void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		  struct ib_udata *udata);
 struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
@@ -147,11 +148,11 @@
 void efa_dealloc_ucontext(struct ib_ucontext *ibucontext);
 int efa_mmap(struct ib_ucontext *ibucontext,
 	     struct vm_area_struct *vma);
+void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry);
 int efa_create_ah(struct ib_ah *ibah,
-		  struct rdma_ah_attr *ah_attr,
-		  u32 flags,
+		  struct rdma_ah_init_attr *init_attr,
 		  struct ib_udata *udata);
-void efa_destroy_ah(struct ib_ah *ibah, u32 flags);
+int efa_destroy_ah(struct ib_ah *ibah, u32 flags);
 int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
 		  int qp_attr_mask, struct ib_udata *udata);
 enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev,
diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
index 2be0469..b199e4a 100644
--- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
+++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_ADMIN_CMDS_H_
@@ -37,7 +37,7 @@
 	EFA_ADMIN_NETWORK_ATTR                      = 3,
 	EFA_ADMIN_QUEUE_ATTR                        = 4,
 	EFA_ADMIN_HW_HINTS                          = 5,
-	EFA_ADMIN_FEATURES_OPCODE_NUM               = 8,
+	EFA_ADMIN_HOST_INFO                         = 6,
 };
 
 /* QP transport type */
@@ -61,6 +61,8 @@
 
 enum efa_admin_get_stats_type {
 	EFA_ADMIN_GET_STATS_TYPE_BASIC              = 0,
+	EFA_ADMIN_GET_STATS_TYPE_MESSAGES           = 1,
+	EFA_ADMIN_GET_STATS_TYPE_RDMA_READ          = 2,
 };
 
 enum efa_admin_get_stats_scope {
@@ -68,14 +70,6 @@
 	EFA_ADMIN_GET_STATS_SCOPE_QUEUE             = 1,
 };
 
-enum efa_admin_modify_qp_mask_bits {
-	EFA_ADMIN_QP_STATE_BIT                      = 0,
-	EFA_ADMIN_CUR_QP_STATE_BIT                  = 1,
-	EFA_ADMIN_QKEY_BIT                          = 2,
-	EFA_ADMIN_SQ_PSN_BIT                        = 3,
-	EFA_ADMIN_SQ_DRAINED_ASYNC_NOTIFY_BIT       = 4,
-};
-
 /*
  * QP allocation sizes, converted by fabric QueuePair (QP) create command
  * from QP capabilities.
@@ -160,10 +154,16 @@
 	/* Common Admin Queue completion descriptor */
 	struct efa_admin_acq_common_desc acq_common_desc;
 
-	/* Opaque handle to be used for consequent operations on the QP */
+	/*
+	 * Opaque handle to be used for consequent admin operations on the
+	 * QP
+	 */
 	u32 qp_handle;
 
-	/* QP number in the given EFA virtual device */
+	/*
+	 * QP number in the given EFA virtual device. Least-significant bits
+	 *    (as needed according to max_qp) carry unique QP ID
+	 */
 	u16 qp_num;
 
 	/* MBZ */
@@ -193,8 +193,14 @@
 	struct efa_admin_aq_common_desc aq_common_desc;
 
 	/*
-	 * Mask indicating which fields should be updated see enum
-	 * efa_admin_modify_qp_mask_bits
+	 * Mask indicating which fields should be updated
+	 * 0 : qp_state
+	 * 1 : cur_qp_state
+	 * 2 : qkey
+	 * 3 : sq_psn
+	 * 4 : sq_drained_async_notify
+	 * 5 : rnr_retry
+	 * 31:6 : reserved
 	 */
 	u32 modify_mask;
 
@@ -216,8 +222,8 @@
 	/* Enable async notification when SQ is drained */
 	u8 sq_drained_async_notify;
 
-	/* MBZ */
-	u8 reserved1;
+	/* Number of RNR retries (valid only for SRD QPs) */
+	u8 rnr_retry;
 
 	/* MBZ */
 	u16 reserved2;
@@ -252,8 +258,8 @@
 	/* Indicates that draining is in progress */
 	u8 sq_draining;
 
-	/* MBZ */
-	u8 reserved1;
+	/* Number of RNR retries (valid only for SRD QPs) */
+	u8 rnr_retry;
 
 	/* MBZ */
 	u16 reserved2;
@@ -286,6 +292,7 @@
 	/* PD number */
 	u16 pd;
 
+	/* MBZ */
 	u16 reserved;
 };
 
@@ -296,6 +303,7 @@
 	/* Target interface address handle (opaque) */
 	u16 ah;
 
+	/* MBZ */
 	u16 reserved;
 };
 
@@ -362,12 +370,17 @@
 
 	/*
 	 * permissions
-	 * 0 : local_write_enable - Write permissions: value
-	 *    of 1 needed for RQ buffers and for RDMA write
-	 * 7:1 : reserved1 - remote access flags, etc
+	 * 0 : local_write_enable - Local write permissions:
+	 *    must be set for RQ buffers and buffers posted for
+	 *    RDMA Read requests
+	 * 1 : reserved1 - MBZ
+	 * 2 : remote_read_enable - Remote read permissions:
+	 *    must be set to enable RDMA read from the region
+	 * 7:3 : reserved2 - MBZ
 	 */
 	u8 permissions;
 
+	/* MBZ */
 	u16 reserved16_w5;
 
 	/* number of pages in PBL (redundant, could be calculated) */
@@ -415,20 +428,20 @@
 	struct efa_admin_aq_common_desc aq_common_desc;
 
 	/*
-	 * 4:0 : reserved5
+	 * 4:0 : reserved5 - MBZ
 	 * 5 : interrupt_mode_enabled - if set, cq operates
 	 *    in interrupt mode (i.e. CQ events and MSI-X are
 	 *    generated), otherwise - polling
 	 * 6 : virt - If set, ring base address is virtual
 	 *    (IOVA returned by MR registration)
-	 * 7 : reserved6
+	 * 7 : reserved6 - MBZ
 	 */
 	u8 cq_caps_1;
 
 	/*
 	 * 4:0 : cq_entry_size_words - size of CQ entry in
 	 *    32-bit words, valid values: 4, 8.
-	 * 7:5 : reserved7
+	 * 7:5 : reserved7 - MBZ
 	 */
 	u8 cq_caps_2;
 
@@ -474,6 +487,7 @@
 
 	u16 cq_idx;
 
+	/* MBZ */
 	u16 reserved1;
 };
 
@@ -516,17 +530,43 @@
 	u64 rx_drops;
 };
 
+struct efa_admin_messages_stats {
+	u64 send_bytes;
+
+	u64 send_wrs;
+
+	u64 recv_bytes;
+
+	u64 recv_wrs;
+};
+
+struct efa_admin_rdma_read_stats {
+	u64 read_wrs;
+
+	u64 read_bytes;
+
+	u64 read_wr_err;
+
+	u64 read_resp_bytes;
+};
+
 struct efa_admin_acq_get_stats_resp {
 	struct efa_admin_acq_common_desc acq_common_desc;
 
-	struct efa_admin_basic_stats basic_stats;
+	union {
+		struct efa_admin_basic_stats basic_stats;
+
+		struct efa_admin_messages_stats messages_stats;
+
+		struct efa_admin_rdma_read_stats rdma_read_stats;
+	} u;
 };
 
 struct efa_admin_get_set_feature_common_desc {
 	/*
 	 * 1:0 : select - 0x1 - current value; 0x3 - default
 	 *    value
-	 * 7:3 : reserved3
+	 * 7:3 : reserved3 - MBZ
 	 */
 	u8 flags;
 
@@ -553,38 +593,51 @@
 	/* Bar used for SQ and RQ doorbells */
 	u16 db_bar;
 
-	/* Indicates how many bits are used physical address access */
+	/* Indicates how many bits are used on physical address access */
 	u8 phys_addr_width;
 
-	/* Indicates how many bits are used virtual address access */
+	/* Indicates how many bits are used on virtual address access */
 	u8 virt_addr_width;
+
+	/*
+	 * 0 : rdma_read - If set, RDMA Read is supported on
+	 *    TX queues
+	 * 1 : rnr_retry - If set, RNR retry is supported on
+	 *    modify QP command
+	 * 31:2 : reserved - MBZ
+	 */
+	u32 device_caps;
+
+	/* Max RDMA transfer size in bytes */
+	u32 max_rdma_size;
 };
 
 struct efa_admin_feature_queue_attr_desc {
 	/* The maximum number of queue pairs supported */
 	u32 max_qp;
 
+	/* Maximum number of WQEs per Send Queue */
 	u32 max_sq_depth;
 
-	/* max send wr used in inline-buf */
+	/* Maximum size of data that can be sent inline in a Send WQE */
 	u32 inline_buf_size;
 
+	/* Maximum number of buffer descriptors per Recv Queue */
 	u32 max_rq_depth;
 
 	/* The maximum number of completion queues supported per VF */
 	u32 max_cq;
 
+	/* Maximum number of CQEs per Completion Queue */
 	u32 max_cq_depth;
 
 	/* Number of sub-CQs to be created for each CQ */
 	u16 sub_cqs_per_cq;
 
-	u16 reserved;
+	/* Minimum number of WQEs per SQ */
+	u16 min_sq_depth;
 
-	/*
-	 * Maximum number of SGEs (buffs) allowed for a single send work
-	 *    queue element (WQE)
-	 */
+	/* Maximum number of SGEs (buffers) allowed for a single send WQE */
 	u16 max_wr_send_sges;
 
 	/* Maximum number of SGEs allowed for a single recv WQE */
@@ -604,6 +657,20 @@
 
 	/* The maximum size of LLQ in bytes */
 	u32 max_llq_size;
+
+	/* Maximum number of SGEs for a single RDMA read WQE */
+	u16 max_wr_rdma_sges;
+
+	/*
+	 * Maximum number of bytes that can be written to SQ between two
+	 * consecutive doorbells (in units of 64B). Driver must ensure that only
+	 * complete WQEs are written to queue before issuing a doorbell.
+	 * Examples: max_tx_batch=16 and WQE size = 64B, means up to 16 WQEs can
+	 * be written to SQ between two consecutive doorbells. max_tx_batch=11
+	 * and WQE size = 128B, means up to 5 WQEs can be written to SQ between
+	 * two consecutive doorbells. Zero means unlimited.
+	 */
+	u16 max_tx_batch;
 };
 
 struct efa_admin_feature_aenq_desc {
@@ -618,6 +685,7 @@
 	/* Raw address data in network byte order */
 	u8 addr[16];
 
+	/* max packet payload size in bytes */
 	u32 mtu;
 };
 
@@ -770,25 +838,95 @@
 	u32 reg_val;
 };
 
+enum efa_admin_os_type {
+	EFA_ADMIN_OS_LINUX                          = 0,
+};
+
+struct efa_admin_host_info {
+	/* OS distribution string format */
+	u8 os_dist_str[128];
+
+	/* Defined in enum efa_admin_os_type */
+	u32 os_type;
+
+	/* Kernel version string format */
+	u8 kernel_ver_str[32];
+
+	/* Kernel version numeric format */
+	u32 kernel_ver;
+
+	/*
+	 * 7:0 : driver_module_type
+	 * 15:8 : driver_sub_minor
+	 * 23:16 : driver_minor
+	 * 31:24 : driver_major
+	 */
+	u32 driver_ver;
+
+	/*
+	 * Device's Bus, Device and Function
+	 * 2:0 : function
+	 * 7:3 : device
+	 * 15:8 : bus
+	 */
+	u16 bdf;
+
+	/*
+	 * Spec version
+	 * 7:0 : spec_minor
+	 * 15:8 : spec_major
+	 */
+	u16 spec_ver;
+
+	/*
+	 * 0 : intree - Intree driver
+	 * 1 : gdr - GPUDirect RDMA supported
+	 * 31:2 : reserved2
+	 */
+	u32 flags;
+};
+
 /* create_qp_cmd */
 #define EFA_ADMIN_CREATE_QP_CMD_SQ_VIRT_MASK                BIT(0)
-#define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_SHIFT               1
 #define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_MASK                BIT(1)
 
+/* modify_qp_cmd */
+#define EFA_ADMIN_MODIFY_QP_CMD_QP_STATE_MASK               BIT(0)
+#define EFA_ADMIN_MODIFY_QP_CMD_CUR_QP_STATE_MASK           BIT(1)
+#define EFA_ADMIN_MODIFY_QP_CMD_QKEY_MASK                   BIT(2)
+#define EFA_ADMIN_MODIFY_QP_CMD_SQ_PSN_MASK                 BIT(3)
+#define EFA_ADMIN_MODIFY_QP_CMD_SQ_DRAINED_ASYNC_NOTIFY_MASK BIT(4)
+#define EFA_ADMIN_MODIFY_QP_CMD_RNR_RETRY_MASK              BIT(5)
+
 /* reg_mr_cmd */
 #define EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK      GENMASK(4, 0)
-#define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_SHIFT     7
 #define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_MASK      BIT(7)
 #define EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK        BIT(0)
+#define EFA_ADMIN_REG_MR_CMD_REMOTE_READ_ENABLE_MASK        BIT(2)
 
 /* create_cq_cmd */
-#define EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_SHIFT 5
 #define EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK BIT(5)
-#define EFA_ADMIN_CREATE_CQ_CMD_VIRT_SHIFT                  6
 #define EFA_ADMIN_CREATE_CQ_CMD_VIRT_MASK                   BIT(6)
 #define EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK    GENMASK(4, 0)
 
 /* get_set_feature_common_desc */
 #define EFA_ADMIN_GET_SET_FEATURE_COMMON_DESC_SELECT_MASK   GENMASK(1, 0)
 
+/* feature_device_attr_desc */
+#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RDMA_READ_MASK   BIT(0)
+#define EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_RNR_RETRY_MASK   BIT(1)
+
+/* host_info */
+#define EFA_ADMIN_HOST_INFO_DRIVER_MODULE_TYPE_MASK         GENMASK(7, 0)
+#define EFA_ADMIN_HOST_INFO_DRIVER_SUB_MINOR_MASK           GENMASK(15, 8)
+#define EFA_ADMIN_HOST_INFO_DRIVER_MINOR_MASK               GENMASK(23, 16)
+#define EFA_ADMIN_HOST_INFO_DRIVER_MAJOR_MASK               GENMASK(31, 24)
+#define EFA_ADMIN_HOST_INFO_FUNCTION_MASK                   GENMASK(2, 0)
+#define EFA_ADMIN_HOST_INFO_DEVICE_MASK                     GENMASK(7, 3)
+#define EFA_ADMIN_HOST_INFO_BUS_MASK                        GENMASK(15, 8)
+#define EFA_ADMIN_HOST_INFO_SPEC_MINOR_MASK                 GENMASK(7, 0)
+#define EFA_ADMIN_HOST_INFO_SPEC_MAJOR_MASK                 GENMASK(15, 8)
+#define EFA_ADMIN_HOST_INFO_INTREE_MASK                     BIT(0)
+#define EFA_ADMIN_HOST_INFO_GDR_MASK                        BIT(1)
+
 #endif /* _EFA_ADMIN_CMDS_H_ */
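Editorial note on the max_tx_batch description added to efa_admin_feature_queue_attr_desc above: the field is expressed in 64-byte units, so the number of complete WQEs a driver may write between two consecutive doorbells works out to (max_tx_batch * 64) / wqe_size, zero meaning unlimited. A minimal sketch of that arithmetic (the helper name and shape are illustrative only, not part of the patch):

/* Illustrative sketch only -- not part of the kernel patch. */
static inline u32 efa_wqes_per_doorbell(u16 max_tx_batch, u32 wqe_size)
{
	if (!max_tx_batch)
		return U32_MAX;	/* zero means no batching limit */

	/* matches the examples in the struct comment:
	 * 16 * 64 / 64  == 16 WQEs
	 * 11 * 64 / 128 ==  5 WQEs
	 */
	return (max_tx_batch * 64) / wqe_size;
}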
diff --git a/drivers/infiniband/hw/efa/efa_admin_defs.h b/drivers/infiniband/hw/efa/efa_admin_defs.h
index c8e0c8b..29d53ed 100644
--- a/drivers/infiniband/hw/efa/efa_admin_defs.h
+++ b/drivers/infiniband/hw/efa/efa_admin_defs.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_ADMIN_H_
@@ -121,9 +121,7 @@
 /* aq_common_desc */
 #define EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK            GENMASK(11, 0)
 #define EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK                 BIT(0)
-#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_SHIFT            1
 #define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK             BIT(1)
-#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_SHIFT   2
 #define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK    BIT(2)
 
 /* acq_common_desc */
diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c
index 0778f4f..336bc2c 100644
--- a/drivers/infiniband/hw/efa/efa_com.c
+++ b/drivers/infiniband/hw/efa/efa_com.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #include "efa_com.h"
@@ -16,26 +16,13 @@
 #define EFA_ASYNC_QUEUE_DEPTH 16
 #define EFA_ADMIN_QUEUE_DEPTH 32
 
-#define MIN_EFA_VER\
-	((EFA_ADMIN_API_VERSION_MAJOR << EFA_REGS_VERSION_MAJOR_VERSION_SHIFT) | \
-	 (EFA_ADMIN_API_VERSION_MINOR & EFA_REGS_VERSION_MINOR_VERSION_MASK))
-
 #define EFA_CTRL_MAJOR          0
 #define EFA_CTRL_MINOR          0
 #define EFA_CTRL_SUB_MINOR      1
 
-#define MIN_EFA_CTRL_VER \
-	(((EFA_CTRL_MAJOR) << \
-	(EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT)) | \
-	((EFA_CTRL_MINOR) << \
-	(EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT)) | \
-	(EFA_CTRL_SUB_MINOR))
-
 #define EFA_DMA_ADDR_TO_UINT32_LOW(x)   ((u32)((u64)(x)))
 #define EFA_DMA_ADDR_TO_UINT32_HIGH(x)  ((u32)(((u64)(x)) >> 32))
 
-#define EFA_REGS_ADMIN_INTR_MASK 1
-
 enum efa_cmd_status {
 	EFA_CMD_SUBMITTED,
 	EFA_CMD_COMPLETED,
@@ -84,7 +71,7 @@
 	struct efa_com_mmio_read *mmio_read = &edev->mmio_read;
 	struct efa_admin_mmio_req_read_less_resp *read_resp;
 	unsigned long exp_time;
-	u32 mmio_read_reg;
+	u32 mmio_read_reg = 0;
 	u32 err;
 
 	read_resp = mmio_read->read_resp;
@@ -94,10 +81,9 @@
 
 	/* trash DMA req_id to identify when hardware is done */
 	read_resp->req_id = mmio_read->seq_num + 0x9aL;
-	mmio_read_reg = (offset << EFA_REGS_MMIO_REG_READ_REG_OFF_SHIFT) &
-			EFA_REGS_MMIO_REG_READ_REG_OFF_MASK;
-	mmio_read_reg |= mmio_read->seq_num &
-			 EFA_REGS_MMIO_REG_READ_REQ_ID_MASK;
+	EFA_SET(&mmio_read_reg, EFA_REGS_MMIO_REG_READ_REG_OFF, offset);
+	EFA_SET(&mmio_read_reg, EFA_REGS_MMIO_REG_READ_REQ_ID,
+		mmio_read->seq_num);
 
 	writel(mmio_read_reg, edev->reg_bar + EFA_REGS_MMIO_REG_READ_OFF);
 
@@ -137,9 +123,9 @@
 	struct efa_com_admin_queue *aq = &edev->aq;
 	struct efa_com_admin_sq *sq = &aq->sq;
 	u16 size = aq->depth * sizeof(*sq->entries);
+	u32 aq_caps = 0;
 	u32 addr_high;
 	u32 addr_low;
-	u32 aq_caps;
 
 	sq->entries =
 		dma_alloc_coherent(aq->dmadev, size, &sq->dma_addr, GFP_KERNEL);
@@ -160,10 +146,9 @@
 	writel(addr_low, edev->reg_bar + EFA_REGS_AQ_BASE_LO_OFF);
 	writel(addr_high, edev->reg_bar + EFA_REGS_AQ_BASE_HI_OFF);
 
-	aq_caps = aq->depth & EFA_REGS_AQ_CAPS_AQ_DEPTH_MASK;
-	aq_caps |= (sizeof(struct efa_admin_aq_entry) <<
-			EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT) &
-			EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK;
+	EFA_SET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_DEPTH, aq->depth);
+	EFA_SET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE,
+		sizeof(struct efa_admin_aq_entry));
 
 	writel(aq_caps, edev->reg_bar + EFA_REGS_AQ_CAPS_OFF);
 
@@ -175,9 +160,9 @@
 	struct efa_com_admin_queue *aq = &edev->aq;
 	struct efa_com_admin_cq *cq = &aq->cq;
 	u16 size = aq->depth * sizeof(*cq->entries);
+	u32 acq_caps = 0;
 	u32 addr_high;
 	u32 addr_low;
-	u32 acq_caps;
 
 	cq->entries =
 		dma_alloc_coherent(aq->dmadev, size, &cq->dma_addr, GFP_KERNEL);
@@ -195,13 +180,11 @@
 	writel(addr_low, edev->reg_bar + EFA_REGS_ACQ_BASE_LO_OFF);
 	writel(addr_high, edev->reg_bar + EFA_REGS_ACQ_BASE_HI_OFF);
 
-	acq_caps = aq->depth & EFA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK;
-	acq_caps |= (sizeof(struct efa_admin_acq_entry) <<
-			EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT) &
-			EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK;
-	acq_caps |= (aq->msix_vector_idx <<
-			EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_SHIFT) &
-			EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_MASK;
+	EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_DEPTH, aq->depth);
+	EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE,
+		sizeof(struct efa_admin_acq_entry));
+	EFA_SET(&acq_caps, EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR,
+		aq->msix_vector_idx);
 
 	writel(acq_caps, edev->reg_bar + EFA_REGS_ACQ_CAPS_OFF);
 
@@ -212,7 +195,8 @@
 				   struct efa_aenq_handlers *aenq_handlers)
 {
 	struct efa_com_aenq *aenq = &edev->aenq;
-	u32 addr_low, addr_high, aenq_caps;
+	u32 addr_low, addr_high;
+	u32 aenq_caps = 0;
 	u16 size;
 
 	if (!aenq_handlers) {
@@ -237,13 +221,11 @@
 	writel(addr_low, edev->reg_bar + EFA_REGS_AENQ_BASE_LO_OFF);
 	writel(addr_high, edev->reg_bar + EFA_REGS_AENQ_BASE_HI_OFF);
 
-	aenq_caps = aenq->depth & EFA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK;
-	aenq_caps |= (sizeof(struct efa_admin_aenq_entry) <<
-		EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT) &
-		EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK;
-	aenq_caps |= (aenq->msix_vector_idx
-		      << EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_SHIFT) &
-		     EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_MASK;
+	EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_DEPTH, aenq->depth);
+	EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE,
+		sizeof(struct efa_admin_aenq_entry));
+	EFA_SET(&aenq_caps, EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR,
+		aenq->msix_vector_idx);
 	writel(aenq_caps, edev->reg_bar + EFA_REGS_AENQ_CAPS_OFF);
 
 	/*
@@ -280,8 +262,8 @@
 static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq,
 					struct efa_comp_ctx *comp_ctx)
 {
-	u16 cmd_id = comp_ctx->user_cqe->acq_common_descriptor.command &
-		     EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK;
+	u16 cmd_id = EFA_GET(&comp_ctx->user_cqe->acq_common_descriptor.command,
+			     EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID);
 	u16 ctx_id = cmd_id & (aq->depth - 1);
 
 	ibdev_dbg(aq->efa_dev, "Put completion command_id %#x\n", cmd_id);
@@ -335,8 +317,8 @@
 	cmd_id &= EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK;
 
 	cmd->aq_common_descriptor.command_id = cmd_id;
-	cmd->aq_common_descriptor.flags |= aq->sq.phase &
-		EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK;
+	EFA_SET(&cmd->aq_common_descriptor.flags,
+		EFA_ADMIN_AQ_COMMON_DESC_PHASE, aq->sq.phase);
 
 	comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, true);
 	if (!comp_ctx) {
@@ -427,8 +409,8 @@
 	struct efa_comp_ctx *comp_ctx;
 	u16 cmd_id;
 
-	cmd_id = cqe->acq_common_descriptor.command &
-		 EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK;
+	cmd_id = EFA_GET(&cqe->acq_common_descriptor.command,
+			 EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID);
 
 	comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false);
 	if (!comp_ctx) {
@@ -649,17 +631,20 @@
 			cmd->aq_common_descriptor.opcode, PTR_ERR(comp_ctx));
 
 		up(&aq->avail_cmds);
+		atomic64_inc(&aq->stats.cmd_err);
 		return PTR_ERR(comp_ctx);
 	}
 
 	err = efa_com_wait_and_process_admin_cq(comp_ctx, aq);
-	if (err)
+	if (err) {
 		ibdev_err_ratelimited(
 			aq->efa_dev,
 			"Failed to process command %s (opcode %u) comp_status %d err %d\n",
 			efa_com_cmd_str(cmd->aq_common_descriptor.opcode),
 			cmd->aq_common_descriptor.opcode, comp_ctx->comp_status,
 			err);
+		atomic64_inc(&aq->stats.cmd_err);
+	}
 
 	up(&aq->avail_cmds);
 
@@ -705,7 +690,7 @@
 	u32 mask_value = 0;
 
 	if (polling)
-		mask_value = EFA_REGS_ADMIN_INTR_MASK;
+		EFA_SET(&mask_value, EFA_REGS_INTR_MASK_EN, 1);
 
 	writel(mask_value, edev->reg_bar + EFA_REGS_INTR_MASK_OFF);
 	if (polling)
@@ -743,7 +728,7 @@
 	int err;
 
 	dev_sts = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF);
-	if (!(dev_sts & EFA_REGS_DEV_STS_READY_MASK)) {
+	if (!EFA_GET(&dev_sts, EFA_REGS_DEV_STS_READY)) {
 		ibdev_err(edev->efa_dev,
 			  "Device isn't ready, abort com init %#x\n", dev_sts);
 		return -ENODEV;
@@ -778,8 +763,7 @@
 		goto err_destroy_cq;
 
 	cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF);
-	timeout = (cap & EFA_REGS_CAPS_ADMIN_CMD_TO_MASK) >>
-		  EFA_REGS_CAPS_ADMIN_CMD_TO_SHIFT;
+	timeout = EFA_GET(&cap, EFA_REGS_CAPS_ADMIN_CMD_TO);
 	if (timeout)
 		/* the resolution of timeout reg is 100ms */
 		aq->completion_timeout = timeout * 100000;
@@ -940,7 +924,9 @@
 
 int efa_com_validate_version(struct efa_com_dev *edev)
 {
+	u32 min_ctrl_ver = 0;
 	u32 ctrl_ver_masked;
+	u32 min_ver = 0;
 	u32 ctrl_ver;
 	u32 ver;
 
@@ -953,33 +939,42 @@
 				      EFA_REGS_CONTROLLER_VERSION_OFF);
 
 	ibdev_dbg(edev->efa_dev, "efa device version: %d.%d\n",
-		  (ver & EFA_REGS_VERSION_MAJOR_VERSION_MASK) >>
-			  EFA_REGS_VERSION_MAJOR_VERSION_SHIFT,
-		  ver & EFA_REGS_VERSION_MINOR_VERSION_MASK);
+		  EFA_GET(&ver, EFA_REGS_VERSION_MAJOR_VERSION),
+		  EFA_GET(&ver, EFA_REGS_VERSION_MINOR_VERSION));
 
-	if (ver < MIN_EFA_VER) {
+	EFA_SET(&min_ver, EFA_REGS_VERSION_MAJOR_VERSION,
+		EFA_ADMIN_API_VERSION_MAJOR);
+	EFA_SET(&min_ver, EFA_REGS_VERSION_MINOR_VERSION,
+		EFA_ADMIN_API_VERSION_MINOR);
+	if (ver < min_ver) {
 		ibdev_err(edev->efa_dev,
 			  "EFA version is lower than the minimal version the driver supports\n");
 		return -EOPNOTSUPP;
 	}
 
-	ibdev_dbg(edev->efa_dev,
-		  "efa controller version: %d.%d.%d implementation version %d\n",
-		  (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) >>
-			  EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT,
-		  (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) >>
-			  EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT,
-		  (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK),
-		  (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK) >>
-			  EFA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT);
+	ibdev_dbg(
+		edev->efa_dev,
+		"efa controller version: %d.%d.%d implementation version %d\n",
+		EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION),
+		EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION),
+		EFA_GET(&ctrl_ver,
+			EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION),
+		EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_IMPL_ID));
 
 	ctrl_ver_masked =
-		(ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) |
-		(ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) |
-		(ctrl_ver & EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK);
+		EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION) |
+		EFA_GET(&ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION) |
+		EFA_GET(&ctrl_ver,
+			EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION);
 
+	EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION,
+		EFA_CTRL_MAJOR);
+	EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION,
+		EFA_CTRL_MINOR);
+	EFA_SET(&min_ctrl_ver, EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION,
+		EFA_CTRL_SUB_MINOR);
 	/* Validate the ctrl version without the implementation ID */
-	if (ctrl_ver_masked < MIN_EFA_CTRL_VER) {
+	if (ctrl_ver_masked < min_ctrl_ver) {
 		ibdev_err(edev->efa_dev,
 			  "EFA ctrl version is lower than the minimal ctrl version the driver supports\n");
 		return -EOPNOTSUPP;
@@ -1002,8 +997,7 @@
 	u32 caps = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF);
 	int width;
 
-	width = (caps & EFA_REGS_CAPS_DMA_ADDR_WIDTH_MASK) >>
-		EFA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT;
+	width = EFA_GET(&caps, EFA_REGS_CAPS_DMA_ADDR_WIDTH);
 
 	ibdev_dbg(edev->efa_dev, "DMA width: %d\n", width);
 
@@ -1017,16 +1011,14 @@
 	return width;
 }
 
-static int wait_for_reset_state(struct efa_com_dev *edev, u32 timeout,
-				u16 exp_state)
+static int wait_for_reset_state(struct efa_com_dev *edev, u32 timeout, int on)
 {
 	u32 val, i;
 
 	for (i = 0; i < timeout; i++) {
 		val = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF);
 
-		if ((val & EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK) ==
-		    exp_state)
+		if (EFA_GET(&val, EFA_REGS_DEV_STS_RESET_IN_PROGRESS) == on)
 			return 0;
 
 		ibdev_dbg(edev->efa_dev, "Reset indication val %d\n", val);
@@ -1046,36 +1038,34 @@
 int efa_com_dev_reset(struct efa_com_dev *edev,
 		      enum efa_regs_reset_reason_types reset_reason)
 {
-	u32 stat, timeout, cap, reset_val;
+	u32 stat, timeout, cap;
+	u32 reset_val = 0;
 	int err;
 
 	stat = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF);
 	cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF);
 
-	if (!(stat & EFA_REGS_DEV_STS_READY_MASK)) {
+	if (!EFA_GET(&stat, EFA_REGS_DEV_STS_READY)) {
 		ibdev_err(edev->efa_dev,
 			  "Device isn't ready, can't reset device\n");
 		return -EINVAL;
 	}
 
-	timeout = (cap & EFA_REGS_CAPS_RESET_TIMEOUT_MASK) >>
-		  EFA_REGS_CAPS_RESET_TIMEOUT_SHIFT;
+	timeout = EFA_GET(&cap, EFA_REGS_CAPS_RESET_TIMEOUT);
 	if (!timeout) {
 		ibdev_err(edev->efa_dev, "Invalid timeout value\n");
 		return -EINVAL;
 	}
 
 	/* start reset */
-	reset_val = EFA_REGS_DEV_CTL_DEV_RESET_MASK;
-	reset_val |= (reset_reason << EFA_REGS_DEV_CTL_RESET_REASON_SHIFT) &
-		     EFA_REGS_DEV_CTL_RESET_REASON_MASK;
+	EFA_SET(&reset_val, EFA_REGS_DEV_CTL_DEV_RESET, 1);
+	EFA_SET(&reset_val, EFA_REGS_DEV_CTL_RESET_REASON, reset_reason);
 	writel(reset_val, edev->reg_bar + EFA_REGS_DEV_CTL_OFF);
 
 	/* reset clears the mmio readless address, restore it */
 	efa_com_mmio_reg_read_resp_addr_init(edev);
 
-	err = wait_for_reset_state(edev, timeout,
-				   EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK);
+	err = wait_for_reset_state(edev, timeout, 1);
 	if (err) {
 		ibdev_err(edev->efa_dev, "Reset indication didn't turn on\n");
 		return err;
@@ -1089,8 +1079,7 @@
 		return err;
 	}
 
-	timeout = (cap & EFA_REGS_CAPS_ADMIN_CMD_TO_MASK) >>
-		  EFA_REGS_CAPS_ADMIN_CMD_TO_SHIFT;
+	timeout = EFA_GET(&cap, EFA_REGS_CAPS_ADMIN_CMD_TO);
 	if (timeout)
 		/* the resolution of timeout reg is 100ms */
 		edev->aq.completion_timeout = timeout * 100000;
diff --git a/drivers/infiniband/hw/efa/efa_com.h b/drivers/infiniband/hw/efa/efa_com.h
index c67dd81..5e4c888 100644
--- a/drivers/infiniband/hw/efa/efa_com.h
+++ b/drivers/infiniband/hw/efa/efa_com.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_COM_H_
@@ -47,6 +47,7 @@
 struct efa_com_stats_admin {
 	atomic64_t submitted_cmd;
 	atomic64_t completed_cmd;
+	atomic64_t cmd_err;
 	atomic64_t no_completion;
 };
 
diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c
index c079f13..f752ef6 100644
--- a/drivers/infiniband/hw/efa/efa_com_cmd.c
+++ b/drivers/infiniband/hw/efa/efa_com_cmd.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #include "efa_com.h"
@@ -76,6 +76,7 @@
 	cmd.qkey = params->qkey;
 	cmd.sq_psn = params->sq_psn;
 	cmd.sq_drained_async_notify = params->sq_drained_async_notify;
+	cmd.rnr_retry = params->rnr_retry;
 
 	err = efa_com_cmd_exec(aq,
 			       (struct efa_admin_aq_entry *)&cmd,
@@ -121,6 +122,7 @@
 	result->qkey = resp.qkey;
 	result->sq_draining = resp.sq_draining;
 	result->sq_psn = resp.sq_psn;
+	result->rnr_retry = resp.rnr_retry;
 
 	return 0;
 }
@@ -161,8 +163,9 @@
 	int err;
 
 	create_cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_CQ;
-	create_cmd.cq_caps_2 = (params->entry_size_in_bytes / 4) &
-				EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK;
+	EFA_SET(&create_cmd.cq_caps_2,
+		EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS,
+		params->entry_size_in_bytes / 4);
 	create_cmd.cq_depth = params->cq_depth;
 	create_cmd.num_sub_cqs = params->num_sub_cqs;
 	create_cmd.uar = params->uarn;
@@ -227,11 +230,10 @@
 	mr_cmd.aq_common_desc.opcode = EFA_ADMIN_REG_MR;
 	mr_cmd.pd = params->pd;
 	mr_cmd.mr_length = params->mr_length_in_bytes;
-	mr_cmd.flags |= params->page_shift &
-		EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK;
+	EFA_SET(&mr_cmd.flags, EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT,
+		params->page_shift);
 	mr_cmd.iova = params->iova;
-	mr_cmd.permissions |= params->permissions &
-			      EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK;
+	mr_cmd.permissions = params->permissions;
 
 	if (params->inline_pbl) {
 		memcpy(mr_cmd.pbl.inline_pbl_array,
@@ -243,11 +245,11 @@
 			params->pbl.pbl.address.mem_addr_low;
 		mr_cmd.pbl.pbl.address.mem_addr_high =
 			params->pbl.pbl.address.mem_addr_high;
-		mr_cmd.aq_common_desc.flags |=
-			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK;
+		EFA_SET(&mr_cmd.aq_common_desc.flags,
+			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1);
 		if (params->indirect)
-			mr_cmd.aq_common_desc.flags |=
-				EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK;
+			EFA_SET(&mr_cmd.aq_common_desc.flags,
+				EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT, 1);
 	}
 
 	err = efa_com_cmd_exec(aq,
@@ -351,7 +353,7 @@
 	return 0;
 }
 
-static bool
+bool
 efa_com_check_supported_feature_id(struct efa_com_dev *edev,
 				   enum efa_admin_aq_feature_id feature_id)
 {
@@ -387,9 +389,8 @@
 	get_cmd.aq_common_descriptor.opcode = EFA_ADMIN_GET_FEATURE;
 
 	if (control_buff_size)
-		get_cmd.aq_common_descriptor.flags =
-			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK;
-
+		EFA_SET(&get_cmd.aq_common_descriptor.flags,
+			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1);
 
 	efa_com_set_dma_addr(control_buf_dma_addr,
 			     &get_cmd.control_buffer.address.mem_addr_high,
@@ -423,28 +424,6 @@
 	return efa_com_get_feature_ex(edev, get_resp, feature_id, 0, 0);
 }
 
-int efa_com_get_network_attr(struct efa_com_dev *edev,
-			     struct efa_com_get_network_attr_result *result)
-{
-	struct efa_admin_get_feature_resp resp;
-	int err;
-
-	err = efa_com_get_feature(edev, &resp,
-				  EFA_ADMIN_NETWORK_ATTR);
-	if (err) {
-		ibdev_err_ratelimited(edev->efa_dev,
-				      "Failed to get network attributes %d\n",
-				      err);
-		return err;
-	}
-
-	memcpy(result->addr, resp.u.network_attr.addr,
-	       sizeof(resp.u.network_attr.addr));
-	result->mtu = resp.u.network_attr.mtu;
-
-	return 0;
-}
-
 int efa_com_get_device_attr(struct efa_com_dev *edev,
 			    struct efa_com_get_device_attr_result *result)
 {
@@ -467,6 +446,8 @@
 	result->phys_addr_width = resp.u.device_attr.phys_addr_width;
 	result->virt_addr_width = resp.u.device_attr.virt_addr_width;
 	result->db_bar = resp.u.device_attr.db_bar;
+	result->max_rdma_size = resp.u.device_attr.max_rdma_size;
+	result->device_caps = resp.u.device_attr.device_caps;
 
 	if (result->admin_api_version < 1) {
 		ibdev_err_ratelimited(
@@ -500,6 +481,21 @@
 	result->max_ah = resp.u.queue_attr.max_ah;
 	result->max_llq_size = resp.u.queue_attr.max_llq_size;
 	result->sub_cqs_per_cq = resp.u.queue_attr.sub_cqs_per_cq;
+	result->max_wr_rdma_sge = resp.u.queue_attr.max_wr_rdma_sges;
+	result->max_tx_batch = resp.u.queue_attr.max_tx_batch;
+	result->min_sq_depth = resp.u.queue_attr.min_sq_depth;
+
+	err = efa_com_get_feature(edev, &resp, EFA_ADMIN_NETWORK_ATTR);
+	if (err) {
+		ibdev_err_ratelimited(edev->efa_dev,
+				      "Failed to get network attributes %d\n",
+				      err);
+		return err;
+	}
+
+	memcpy(result->addr, resp.u.network_attr.addr,
+	       sizeof(resp.u.network_attr.addr));
+	result->mtu = resp.u.network_attr.mtu;
 
 	return 0;
 }
@@ -525,12 +521,12 @@
 	return 0;
 }
 
-static int efa_com_set_feature_ex(struct efa_com_dev *edev,
-				  struct efa_admin_set_feature_resp *set_resp,
-				  struct efa_admin_set_feature_cmd *set_cmd,
-				  enum efa_admin_aq_feature_id feature_id,
-				  dma_addr_t control_buf_dma_addr,
-				  u32 control_buff_size)
+int efa_com_set_feature_ex(struct efa_com_dev *edev,
+			   struct efa_admin_set_feature_resp *set_resp,
+			   struct efa_admin_set_feature_cmd *set_cmd,
+			   enum efa_admin_aq_feature_id feature_id,
+			   dma_addr_t control_buf_dma_addr,
+			   u32 control_buff_size)
 {
 	struct efa_com_admin_queue *aq;
 	int err;
@@ -546,8 +542,9 @@
 
 	set_cmd->aq_common_descriptor.opcode = EFA_ADMIN_SET_FEATURE;
 	if (control_buff_size) {
-		set_cmd->aq_common_descriptor.flags =
-			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK;
+		set_cmd->aq_common_descriptor.flags = 0;
+		EFA_SET(&set_cmd->aq_common_descriptor.flags,
+			EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA, 1);
 		efa_com_set_dma_addr(control_buf_dma_addr,
 				     &set_cmd->control_buffer.address.mem_addr_high,
 				     &set_cmd->control_buffer.address.mem_addr_low);
@@ -755,11 +752,27 @@
 		return err;
 	}
 
-	result->basic_stats.tx_bytes = resp.basic_stats.tx_bytes;
-	result->basic_stats.tx_pkts = resp.basic_stats.tx_pkts;
-	result->basic_stats.rx_bytes = resp.basic_stats.rx_bytes;
-	result->basic_stats.rx_pkts = resp.basic_stats.rx_pkts;
-	result->basic_stats.rx_drops = resp.basic_stats.rx_drops;
+	switch (cmd.type) {
+	case EFA_ADMIN_GET_STATS_TYPE_BASIC:
+		result->basic_stats.tx_bytes = resp.u.basic_stats.tx_bytes;
+		result->basic_stats.tx_pkts = resp.u.basic_stats.tx_pkts;
+		result->basic_stats.rx_bytes = resp.u.basic_stats.rx_bytes;
+		result->basic_stats.rx_pkts = resp.u.basic_stats.rx_pkts;
+		result->basic_stats.rx_drops = resp.u.basic_stats.rx_drops;
+		break;
+	case EFA_ADMIN_GET_STATS_TYPE_MESSAGES:
+		result->messages_stats.send_bytes = resp.u.messages_stats.send_bytes;
+		result->messages_stats.send_wrs = resp.u.messages_stats.send_wrs;
+		result->messages_stats.recv_bytes = resp.u.messages_stats.recv_bytes;
+		result->messages_stats.recv_wrs = resp.u.messages_stats.recv_wrs;
+		break;
+	case EFA_ADMIN_GET_STATS_TYPE_RDMA_READ:
+		result->rdma_read_stats.read_wrs = resp.u.rdma_read_stats.read_wrs;
+		result->rdma_read_stats.read_bytes = resp.u.rdma_read_stats.read_bytes;
+		result->rdma_read_stats.read_wr_err = resp.u.rdma_read_stats.read_wr_err;
+		result->rdma_read_stats.read_resp_bytes = resp.u.rdma_read_stats.read_resp_bytes;
+		break;
+	}
 
 	return 0;
 }
diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h
index 7f6c130..eea4ebf 100644
--- a/drivers/infiniband/hw/efa/efa_com_cmd.h
+++ b/drivers/infiniband/hw/efa/efa_com_cmd.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_COM_CMD_H_
@@ -47,6 +47,7 @@
 	u32 qkey;
 	u32 sq_psn;
 	u8 sq_drained_async_notify;
+	u8 rnr_retry;
 };
 
 struct efa_com_query_qp_params {
@@ -58,6 +59,7 @@
 	u32 qkey;
 	u32 sq_draining;
 	u32 sq_psn;
+	u8 rnr_retry;
 };
 
 struct efa_com_destroy_qp_params {
@@ -100,14 +102,11 @@
 	u16 pdn;
 };
 
-struct efa_com_get_network_attr_result {
-	u8 addr[EFA_GID_SIZE];
-	u32 mtu;
-};
-
 struct efa_com_get_device_attr_result {
+	u8 addr[EFA_GID_SIZE];
 	u64 page_size_cap;
 	u64 max_mr_pages;
+	u32 mtu;
 	u32 fw_version;
 	u32 admin_api_version;
 	u32 device_version;
@@ -124,9 +123,14 @@
 	u32 max_pd;
 	u32 max_ah;
 	u32 max_llq_size;
+	u32 max_rdma_size;
+	u32 device_caps;
 	u16 sub_cqs_per_cq;
 	u16 max_sq_sge;
 	u16 max_rq_sge;
+	u16 max_wr_rdma_sge;
+	u16 max_tx_batch;
+	u16 min_sq_depth;
 	u8 db_bar;
 };
 
@@ -181,12 +185,7 @@
 	 * address mapping
 	 */
 	u8 page_shift;
-	/*
-	 * permissions
-	 * 0: local_write_enable - Write permissions: value of 1 needed
-	 * for RQ buffers and for RDMA write:1: reserved1 - remote
-	 * access flags, etc
-	 */
+	/* see permissions field of struct efa_admin_reg_mr_cmd */
 	u8 permissions;
 	u8 inline_pbl;
 	u8 indirect;
@@ -241,8 +240,24 @@
 	u64 rx_drops;
 };
 
+struct efa_com_messages_stats {
+	u64 send_bytes;
+	u64 send_wrs;
+	u64 recv_bytes;
+	u64 recv_wrs;
+};
+
+struct efa_com_rdma_read_stats {
+	u64 read_wrs;
+	u64 read_bytes;
+	u64 read_wr_err;
+	u64 read_resp_bytes;
+};
+
 union efa_com_get_stats_result {
 	struct efa_com_basic_stats basic_stats;
+	struct efa_com_messages_stats messages_stats;
+	struct efa_com_rdma_read_stats rdma_read_stats;
 };
 
 void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low);
@@ -271,12 +286,19 @@
 		      struct efa_com_create_ah_result *result);
 int efa_com_destroy_ah(struct efa_com_dev *edev,
 		       struct efa_com_destroy_ah_params *params);
-int efa_com_get_network_attr(struct efa_com_dev *edev,
-			     struct efa_com_get_network_attr_result *result);
 int efa_com_get_device_attr(struct efa_com_dev *edev,
 			    struct efa_com_get_device_attr_result *result);
 int efa_com_get_hw_hints(struct efa_com_dev *edev,
 			 struct efa_com_get_hw_hints_result *result);
+bool
+efa_com_check_supported_feature_id(struct efa_com_dev *edev,
+				   enum efa_admin_aq_feature_id feature_id);
+int efa_com_set_feature_ex(struct efa_com_dev *edev,
+			   struct efa_admin_set_feature_resp *set_resp,
+			   struct efa_admin_set_feature_cmd *set_cmd,
+			   enum efa_admin_aq_feature_id feature_id,
+			   dma_addr_t control_buf_dma_addr,
+			   u32 control_buff_size);
 int efa_com_set_aenq_config(struct efa_com_dev *edev, u32 groups);
 int efa_com_alloc_pd(struct efa_com_dev *edev,
 		     struct efa_com_alloc_pd_result *result);
diff --git a/drivers/infiniband/hw/efa/efa_common_defs.h b/drivers/infiniband/hw/efa/efa_common_defs.h
index c559ec0..90af1c8 100644
--- a/drivers/infiniband/hw/efa/efa_common_defs.h
+++ b/drivers/infiniband/hw/efa/efa_common_defs.h
@@ -1,14 +1,25 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_COMMON_H_
 #define _EFA_COMMON_H_
 
+#include <linux/bitfield.h>
+
 #define EFA_COMMON_SPEC_VERSION_MAJOR        2
 #define EFA_COMMON_SPEC_VERSION_MINOR        0
 
+#define EFA_GET(ptr, mask) FIELD_GET(mask##_MASK, *(ptr))
+
+#define EFA_SET(ptr, mask, value)                                              \
+	({                                                                     \
+		typeof(ptr) _ptr = ptr;                                        \
+		*_ptr = (*_ptr & ~(mask##_MASK)) |                             \
+			FIELD_PREP(mask##_MASK, value);                        \
+	})
+
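A short usage sketch for the EFA_GET()/EFA_SET() accessors defined above, mirroring what efa_com_admin_init_sq() does earlier in this patch. The masks are the real ones from efa_regs_defs.h; the point is that FIELD_PREP()/FIELD_GET() recover the bit position from the mask, which is why the separate *_SHIFT defines are dropped throughout.

/* Sketch, assumes the surrounding kernel headers; not part of the patch. */
u32 aq_caps = 0;
u32 depth;

EFA_SET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_DEPTH, 32);		/* bits 15:0  */
EFA_SET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE,
	sizeof(struct efa_admin_aq_entry));			/* bits 31:16 */

depth = EFA_GET(&aq_caps, EFA_REGS_AQ_CAPS_AQ_DEPTH);		/* back to 32 */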
 struct efa_common_mem_addr {
 	u32 mem_addr_low;
 
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
index 75dfe9d..ffdd18f 100644
--- a/drivers/infiniband/hw/efa/efa_main.c
+++ b/drivers/infiniband/hw/efa/efa_main.c
@@ -1,19 +1,23 @@
 // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
 
 #include <rdma/ib_user_verbs.h>
 
 #include "efa.h"
 
-#define PCI_DEV_ID_EFA_VF 0xefa0
+#define PCI_DEV_ID_EFA0_VF 0xefa0
+#define PCI_DEV_ID_EFA1_VF 0xefa1
 
 static const struct pci_device_id efa_pci_tbl[] = {
-	{ PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA_VF) },
+	{ PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA0_VF) },
+	{ PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA1_VF) },
 	{ }
 };
 
@@ -30,15 +34,6 @@
 	(BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \
 	 BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE))
 
-static void efa_update_network_attr(struct efa_dev *dev,
-				    struct efa_com_get_network_attr_result *network_attr)
-{
-	memcpy(dev->addr, network_attr->addr, sizeof(network_attr->addr));
-	dev->mtu = network_attr->mtu;
-
-	dev_dbg(&dev->pdev->dev, "Full address %pI6\n", dev->addr);
-}
-
 /* This handler will called for unknown event group or unimplemented handlers */
 static void unimplemented_aenq_handler(void *data,
 				       struct efa_admin_aenq_entry *aenq_e)
@@ -196,6 +191,52 @@
 		atomic64_set(s, 0);
 }
 
+static void efa_set_host_info(struct efa_dev *dev)
+{
+	struct efa_admin_set_feature_resp resp = {};
+	struct efa_admin_set_feature_cmd cmd = {};
+	struct efa_admin_host_info *hinf;
+	u32 bufsz = sizeof(*hinf);
+	dma_addr_t hinf_dma;
+
+	if (!efa_com_check_supported_feature_id(&dev->edev,
+						EFA_ADMIN_HOST_INFO))
+		return;
+
+	/* Failures in host info set shall not disturb probe */
+	hinf = dma_alloc_coherent(&dev->pdev->dev, bufsz, &hinf_dma,
+				  GFP_KERNEL);
+	if (!hinf)
+		return;
+
+	strlcpy(hinf->os_dist_str, utsname()->release,
+		min(sizeof(hinf->os_dist_str), sizeof(utsname()->release)));
+	hinf->os_type = EFA_ADMIN_OS_LINUX;
+	strlcpy(hinf->kernel_ver_str, utsname()->version,
+		min(sizeof(hinf->kernel_ver_str), sizeof(utsname()->version)));
+	hinf->kernel_ver = LINUX_VERSION_CODE;
+	EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MAJOR, 0);
+	EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MINOR, 0);
+	EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_SUB_MINOR, 0);
+	EFA_SET(&hinf->driver_ver, EFA_ADMIN_HOST_INFO_DRIVER_MODULE_TYPE, 0);
+	EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_BUS, dev->pdev->bus->number);
+	EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_DEVICE,
+		PCI_SLOT(dev->pdev->devfn));
+	EFA_SET(&hinf->bdf, EFA_ADMIN_HOST_INFO_FUNCTION,
+		PCI_FUNC(dev->pdev->devfn));
+	EFA_SET(&hinf->spec_ver, EFA_ADMIN_HOST_INFO_SPEC_MAJOR,
+		EFA_COMMON_SPEC_VERSION_MAJOR);
+	EFA_SET(&hinf->spec_ver, EFA_ADMIN_HOST_INFO_SPEC_MINOR,
+		EFA_COMMON_SPEC_VERSION_MINOR);
+	EFA_SET(&hinf->flags, EFA_ADMIN_HOST_INFO_INTREE, 1);
+	EFA_SET(&hinf->flags, EFA_ADMIN_HOST_INFO_GDR, 0);
+
+	efa_com_set_feature_ex(&dev->edev, &resp, &cmd, EFA_ADMIN_HOST_INFO,
+			       hinf_dma, bufsz);
+
+	dma_free_coherent(&dev->pdev->dev, bufsz, hinf, hinf_dma);
+}
+
 static const struct ib_device_ops efa_dev_ops = {
 	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_EFA,
@@ -217,6 +258,7 @@
 	.get_link_layer = efa_port_link_layer,
 	.get_port_immutable = efa_get_port_immutable,
 	.mmap = efa_mmap,
+	.mmap_free = efa_mmap_free,
 	.modify_qp = efa_modify_qp,
 	.query_device = efa_query_device,
 	.query_gid = efa_query_gid,
@@ -233,7 +275,6 @@
 
 static int efa_ib_device_add(struct efa_dev *dev)
 {
-	struct efa_com_get_network_attr_result network_attr;
 	struct efa_com_get_hw_hints_result hw_hints;
 	struct pci_dev *pdev = dev->pdev;
 	int err;
@@ -249,12 +290,6 @@
 	if (err)
 		return err;
 
-	err = efa_com_get_network_attr(&dev->edev, &network_attr);
-	if (err)
-		goto err_release_doorbell_bar;
-
-	efa_update_network_attr(dev, &network_attr);
-
 	err = efa_com_get_hw_hints(&dev->edev, &hw_hints);
 	if (err)
 		goto err_release_doorbell_bar;
@@ -266,6 +301,8 @@
 	if (err)
 		goto err_release_doorbell_bar;
 
+	efa_set_host_info(dev);
+
 	dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED;
 	dev->ibdev.phys_port_cnt = 1;
 	dev->ibdev.num_comp_vectors = 1;
@@ -294,7 +331,7 @@
 
 	ib_set_device_ops(&dev->ibdev, &efa_dev_ops);
 
-	err = ib_register_device(&dev->ibdev, "efa_%d");
+	err = ib_register_device(&dev->ibdev, "efa_%d", &pdev->dev);
 	if (err)
 		goto err_release_doorbell_bar;
 
@@ -382,7 +419,7 @@
 			err);
 		return err;
 	}
-
+	dma_set_max_seg_size(&pdev->dev, UINT_MAX);
 	return 0;
 }
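A worked example of the bdf packing done in efa_set_host_info() above, using the EFA_ADMIN_HOST_INFO_* masks from efa_admin_cmds.h. The PCI address 3a:00.1 is hypothetical, chosen only to show the layout (bus in bits 15:8, device in 7:3, function in 2:0):

/* Illustrative values only -- not part of the patch. */
u16 bdf = 0;

EFA_SET(&bdf, EFA_ADMIN_HOST_INFO_BUS, 0x3a);		/* bits 15:8 */
EFA_SET(&bdf, EFA_ADMIN_HOST_INFO_DEVICE, 0x00);	/* bits 7:3  */
EFA_SET(&bdf, EFA_ADMIN_HOST_INFO_FUNCTION, 0x1);	/* bits 2:0  */
/* bdf == 0x3a01 */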
 
diff --git a/drivers/infiniband/hw/efa/efa_regs_defs.h b/drivers/infiniband/hw/efa/efa_regs_defs.h
index bb9cad3..4017982 100644
--- a/drivers/infiniband/hw/efa/efa_regs_defs.h
+++ b/drivers/infiniband/hw/efa/efa_regs_defs.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef _EFA_REGS_H_
@@ -45,69 +45,52 @@
 
 /* version register */
 #define EFA_REGS_VERSION_MINOR_VERSION_MASK                 0xff
-#define EFA_REGS_VERSION_MAJOR_VERSION_SHIFT                8
 #define EFA_REGS_VERSION_MAJOR_VERSION_MASK                 0xff00
 
 /* controller_version register */
 #define EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK   0xff
-#define EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT     8
 #define EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK      0xff00
-#define EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT     16
 #define EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK      0xff0000
-#define EFA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT           24
 #define EFA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK            0xff000000
 
 /* caps register */
 #define EFA_REGS_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK        0x1
-#define EFA_REGS_CAPS_RESET_TIMEOUT_SHIFT                   1
 #define EFA_REGS_CAPS_RESET_TIMEOUT_MASK                    0x3e
-#define EFA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT                  8
 #define EFA_REGS_CAPS_DMA_ADDR_WIDTH_MASK                   0xff00
-#define EFA_REGS_CAPS_ADMIN_CMD_TO_SHIFT                    16
 #define EFA_REGS_CAPS_ADMIN_CMD_TO_MASK                     0xf0000
 
 /* aq_caps register */
 #define EFA_REGS_AQ_CAPS_AQ_DEPTH_MASK                      0xffff
-#define EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT                16
 #define EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK                 0xffff0000
 
 /* acq_caps register */
 #define EFA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK                    0xffff
-#define EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT              16
 #define EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK               0xff0000
-#define EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_SHIFT             24
 #define EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_MASK              0xff000000
 
 /* aenq_caps register */
 #define EFA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK                  0xffff
-#define EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT            16
 #define EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK             0xff0000
-#define EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_SHIFT           24
 #define EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_MASK            0xff000000
 
+/* intr_mask register */
+#define EFA_REGS_INTR_MASK_EN_MASK                          0x1
+
 /* dev_ctl register */
 #define EFA_REGS_DEV_CTL_DEV_RESET_MASK                     0x1
-#define EFA_REGS_DEV_CTL_AQ_RESTART_SHIFT                   1
 #define EFA_REGS_DEV_CTL_AQ_RESTART_MASK                    0x2
-#define EFA_REGS_DEV_CTL_RESET_REASON_SHIFT                 28
 #define EFA_REGS_DEV_CTL_RESET_REASON_MASK                  0xf0000000
 
 /* dev_sts register */
 #define EFA_REGS_DEV_STS_READY_MASK                         0x1
-#define EFA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_SHIFT       1
 #define EFA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK        0x2
-#define EFA_REGS_DEV_STS_AQ_RESTART_FINISHED_SHIFT          2
 #define EFA_REGS_DEV_STS_AQ_RESTART_FINISHED_MASK           0x4
-#define EFA_REGS_DEV_STS_RESET_IN_PROGRESS_SHIFT            3
 #define EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK             0x8
-#define EFA_REGS_DEV_STS_RESET_FINISHED_SHIFT               4
 #define EFA_REGS_DEV_STS_RESET_FINISHED_MASK                0x10
-#define EFA_REGS_DEV_STS_FATAL_ERROR_SHIFT                  5
 #define EFA_REGS_DEV_STS_FATAL_ERROR_MASK                   0x20
 
 /* mmio_reg_read register */
 #define EFA_REGS_MMIO_REG_READ_REQ_ID_MASK                  0xffff
-#define EFA_REGS_MMIO_REG_READ_REG_OFF_SHIFT                16
 #define EFA_REGS_MMIO_REG_READ_REG_OFF_MASK                 0xffff0000
 
 #endif /* _EFA_REGS_H_ */
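The *_SHIFT defines removed from this header become redundant because FIELD_GET() derives the shift from the mask itself. A small sketch with the reset-timeout field (mask 0x3e, i.e. bits 5:1); edev is assumed from the surrounding driver code, as in efa_com_dev_reset():

/* Sketch only -- both forms yield the same value. */
u32 caps = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF);
u32 old_way = (caps & EFA_REGS_CAPS_RESET_TIMEOUT_MASK) >> 1;	/* shift spelled out */
u32 new_way = EFA_GET(&caps, EFA_REGS_CAPS_RESET_TIMEOUT);	/* shift derived from 0x3e */
/* old_way == new_way */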
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
index 17f1e59..2ece682 100644
--- a/drivers/infiniband/hw/efa/efa_verbs.c
+++ b/drivers/infiniband/hw/efa/efa_verbs.c
@@ -1,9 +1,10 @@
 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /*
- * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #include <linux/vmalloc.h>
+#include <linux/log2.h>
 
 #include <rdma/ib_addr.h>
 #include <rdma/ib_umem.h>
@@ -13,10 +14,6 @@
 
 #include "efa.h"
 
-#define EFA_MMAP_FLAG_SHIFT 56
-#define EFA_MMAP_PAGE_MASK GENMASK(EFA_MMAP_FLAG_SHIFT - 1, 0)
-#define EFA_MMAP_INVALID U64_MAX
-
 enum {
 	EFA_MMAP_DMA_PAGE = 0,
 	EFA_MMAP_IO_WC,
@@ -27,35 +24,38 @@
 	(BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \
 	 BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE))
 
-struct efa_mmap_entry {
-	void  *obj;
+struct efa_user_mmap_entry {
+	struct rdma_user_mmap_entry rdma_entry;
 	u64 address;
-	u64 length;
-	u32 mmap_page;
 	u8 mmap_flag;
 };
 
-static inline u64 get_mmap_key(const struct efa_mmap_entry *efa)
-{
-	return ((u64)efa->mmap_flag << EFA_MMAP_FLAG_SHIFT) |
-	       ((u64)efa->mmap_page << PAGE_SHIFT);
-}
-
 #define EFA_DEFINE_STATS(op) \
 	op(EFA_TX_BYTES, "tx_bytes") \
 	op(EFA_TX_PKTS, "tx_pkts") \
 	op(EFA_RX_BYTES, "rx_bytes") \
 	op(EFA_RX_PKTS, "rx_pkts") \
 	op(EFA_RX_DROPS, "rx_drops") \
+	op(EFA_SEND_BYTES, "send_bytes") \
+	op(EFA_SEND_WRS, "send_wrs") \
+	op(EFA_RECV_BYTES, "recv_bytes") \
+	op(EFA_RECV_WRS, "recv_wrs") \
+	op(EFA_RDMA_READ_WRS, "rdma_read_wrs") \
+	op(EFA_RDMA_READ_BYTES, "rdma_read_bytes") \
+	op(EFA_RDMA_READ_WR_ERR, "rdma_read_wr_err") \
+	op(EFA_RDMA_READ_RESP_BYTES, "rdma_read_resp_bytes") \
 	op(EFA_SUBMITTED_CMDS, "submitted_cmds") \
 	op(EFA_COMPLETED_CMDS, "completed_cmds") \
+	op(EFA_CMDS_ERR, "cmds_err") \
 	op(EFA_NO_COMPLETION_CMDS, "no_completion_cmds") \
 	op(EFA_KEEP_ALIVE_RCVD, "keep_alive_rcvd") \
 	op(EFA_ALLOC_PD_ERR, "alloc_pd_err") \
 	op(EFA_CREATE_QP_ERR, "create_qp_err") \
+	op(EFA_CREATE_CQ_ERR, "create_cq_err") \
 	op(EFA_REG_MR_ERR, "reg_mr_err") \
 	op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \
-	op(EFA_CREATE_AH_ERR, "create_ah_err")
+	op(EFA_CREATE_AH_ERR, "create_ah_err") \
+	op(EFA_MMAP_ERR, "mmap_err")
 
 #define EFA_STATS_ENUM(ename, name) ename,
 #define EFA_STATS_STR(ename, name) [ename] = name,
@@ -82,8 +82,6 @@
 #define EFA_CHUNK_USED_SIZE \
 	((EFA_PTRS_PER_CHUNK * EFA_CHUNK_PAYLOAD_PTR_SIZE) + EFA_CHUNK_PTR_SIZE)
 
-#define EFA_SUPPORTED_ACCESS_FLAGS IB_ACCESS_LOCAL_WRITE
-
 struct pbl_chunk {
 	dma_addr_t dma_addr;
 	u64 *buf;
@@ -147,8 +145,15 @@
 	return container_of(ibah, struct efa_ah, ibah);
 }
 
-#define field_avail(x, fld, sz) (offsetof(typeof(x), fld) + \
-				 FIELD_SIZEOF(typeof(x), fld) <= (sz))
+static inline struct efa_user_mmap_entry *
+to_emmap(struct rdma_user_mmap_entry *rdma_entry)
+{
+	return container_of(rdma_entry, struct efa_user_mmap_entry, rdma_entry);
+}
+
+#define EFA_DEV_CAP(dev, cap) \
+	((dev)->dev_attr.device_caps & \
+	 EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_##cap##_MASK)
 
 #define is_reserved_cleared(reserved) \
 	!memchr_inv(reserved, 0, sizeof(reserved))
@@ -172,104 +177,12 @@
 	return addr;
 }
 
-/*
- * This is only called when the ucontext is destroyed and there can be no
- * concurrent query via mmap or allocate on the xarray, thus we can be sure no
- * other thread is using the entry pointer. We also know that all the BAR
- * pages have either been zap'd or munmaped at this point.  Normal pages are
- * refcounted and will be freed at the proper time.
- */
-static void mmap_entries_remove_free(struct efa_dev *dev,
-				     struct efa_ucontext *ucontext)
+static void efa_free_mapped(struct efa_dev *dev, void *cpu_addr,
+			    dma_addr_t dma_addr,
+			    size_t size, enum dma_data_direction dir)
 {
-	struct efa_mmap_entry *entry;
-	unsigned long mmap_page;
-
-	xa_for_each(&ucontext->mmap_xa, mmap_page, entry) {
-		xa_erase(&ucontext->mmap_xa, mmap_page);
-
-		ibdev_dbg(
-			&dev->ibdev,
-			"mmap: obj[0x%p] key[%#llx] addr[%#llx] len[%#llx] removed\n",
-			entry->obj, get_mmap_key(entry), entry->address,
-			entry->length);
-		if (entry->mmap_flag == EFA_MMAP_DMA_PAGE)
-			/* DMA mapping is already gone, now free the pages */
-			free_pages_exact(phys_to_virt(entry->address),
-					 entry->length);
-		kfree(entry);
-	}
-}
-
-static struct efa_mmap_entry *mmap_entry_get(struct efa_dev *dev,
-					     struct efa_ucontext *ucontext,
-					     u64 key, u64 len)
-{
-	struct efa_mmap_entry *entry;
-	u64 mmap_page;
-
-	mmap_page = (key & EFA_MMAP_PAGE_MASK) >> PAGE_SHIFT;
-	if (mmap_page > U32_MAX)
-		return NULL;
-
-	entry = xa_load(&ucontext->mmap_xa, mmap_page);
-	if (!entry || get_mmap_key(entry) != key || entry->length != len)
-		return NULL;
-
-	ibdev_dbg(&dev->ibdev,
-		  "mmap: obj[0x%p] key[%#llx] addr[%#llx] len[%#llx] removed\n",
-		  entry->obj, key, entry->address, entry->length);
-
-	return entry;
-}
-
-/*
- * Note this locking scheme cannot support removal of entries, except during
- * ucontext destruction when the core code guarentees no concurrency.
- */
-static u64 mmap_entry_insert(struct efa_dev *dev, struct efa_ucontext *ucontext,
-			     void *obj, u64 address, u64 length, u8 mmap_flag)
-{
-	struct efa_mmap_entry *entry;
-	u32 next_mmap_page;
-	int err;
-
-	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
-	if (!entry)
-		return EFA_MMAP_INVALID;
-
-	entry->obj = obj;
-	entry->address = address;
-	entry->length = length;
-	entry->mmap_flag = mmap_flag;
-
-	xa_lock(&ucontext->mmap_xa);
-	if (check_add_overflow(ucontext->mmap_xa_page,
-			       (u32)(length >> PAGE_SHIFT),
-			       &next_mmap_page))
-		goto err_unlock;
-
-	entry->mmap_page = ucontext->mmap_xa_page;
-	ucontext->mmap_xa_page = next_mmap_page;
-	err = __xa_insert(&ucontext->mmap_xa, entry->mmap_page, entry,
-			  GFP_KERNEL);
-	if (err)
-		goto err_unlock;
-
-	xa_unlock(&ucontext->mmap_xa);
-
-	ibdev_dbg(
-		&dev->ibdev,
-		"mmap: obj[0x%p] addr[%#llx], len[%#llx], key[%#llx] inserted\n",
-		entry->obj, entry->address, entry->length, get_mmap_key(entry));
-
-	return get_mmap_key(entry);
-
-err_unlock:
-	xa_unlock(&ucontext->mmap_xa);
-	kfree(entry);
-	return EFA_MMAP_INVALID;
-
+	dma_unmap_single(&dev->pdev->dev, dma_addr, size, dir);
+	free_pages_exact(cpu_addr, size);
 }
 
 int efa_query_device(struct ib_device *ibdev,
@@ -306,12 +219,21 @@
 				 dev_attr->max_rq_depth);
 	props->max_send_sge = dev_attr->max_sq_sge;
 	props->max_recv_sge = dev_attr->max_rq_sge;
+	props->max_sge_rd = dev_attr->max_wr_rdma_sge;
+	props->max_pkeys = 1;
 
 	if (udata && udata->outlen) {
 		resp.max_sq_sge = dev_attr->max_sq_sge;
 		resp.max_rq_sge = dev_attr->max_rq_sge;
 		resp.max_sq_wr = dev_attr->max_sq_depth;
 		resp.max_rq_wr = dev_attr->max_rq_depth;
+		resp.max_rdma_size = dev_attr->max_rdma_size;
+
+		if (EFA_DEV_CAP(dev, RDMA_READ))
+			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_READ;
+
+		if (EFA_DEV_CAP(dev, RNR_RETRY))
+			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RNR_RETRY;
 
 		err = ib_copy_to_udata(udata, &resp,
 				       min(sizeof(resp), udata->outlen));
@@ -338,9 +260,9 @@
 	props->pkey_tbl_len = 1;
 	props->active_speed = IB_SPEED_EDR;
 	props->active_width = IB_WIDTH_4X;
-	props->max_mtu = ib_mtu_int_to_enum(dev->mtu);
-	props->active_mtu = ib_mtu_int_to_enum(dev->mtu);
-	props->max_msg_sz = dev->mtu;
+	props->max_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu);
+	props->active_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu);
+	props->max_msg_sz = dev->dev_attr.mtu;
 	props->max_vl_num = 1;
 
 	return 0;
@@ -358,7 +280,7 @@
 
 #define EFA_QUERY_QP_SUPP_MASK \
 	(IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | \
-	 IB_QP_QKEY | IB_QP_SQ_PSN | IB_QP_CAP)
+	 IB_QP_QKEY | IB_QP_SQ_PSN | IB_QP_CAP | IB_QP_RNR_RETRY)
 
 	if (qp_attr_mask & ~EFA_QUERY_QP_SUPP_MASK) {
 		ibdev_dbg(&dev->ibdev,
@@ -380,6 +302,7 @@
 	qp_attr->sq_psn = result.sq_psn;
 	qp_attr->sq_draining = result.sq_draining;
 	qp_attr->port_num = 1;
+	qp_attr->rnr_retry = result.rnr_retry;
 
 	qp_attr->cap.max_send_wr = qp->max_send_wr;
 	qp_attr->cap.max_recv_wr = qp->max_recv_wr;
@@ -401,7 +324,7 @@
 {
 	struct efa_dev *dev = to_edev(ibdev);
 
-	memcpy(gid->raw, dev->addr, sizeof(dev->addr));
+	memcpy(gid->raw, dev->dev_attr.addr, sizeof(dev->dev_attr.addr));
 
 	return 0;
 }
@@ -465,17 +388,18 @@
 err_dealloc_pd:
 	efa_pd_dealloc(dev, result.pdn);
 err_out:
-	atomic64_inc(&dev->stats.sw_stats.alloc_pd_err);
+	atomic64_inc(&dev->stats.alloc_pd_err);
 	return err;
 }
 
-void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
 	struct efa_dev *dev = to_edev(ibpd->device);
 	struct efa_pd *pd = to_epd(ibpd);
 
 	ibdev_dbg(&dev->ibdev, "Dealloc pd[%d]\n", pd->pdn);
 	efa_pd_dealloc(dev, pd->pdn);
+	return 0;
 }
 
 static int efa_destroy_qp_handle(struct efa_dev *dev, u32 qp_handle)
@@ -485,6 +409,14 @@
 	return efa_com_destroy_qp(&dev->edev, &params);
 }
 
+static void efa_qp_user_mmap_entries_remove(struct efa_qp *qp)
+{
+	rdma_user_mmap_entry_remove(qp->rq_mmap_entry);
+	rdma_user_mmap_entry_remove(qp->rq_db_mmap_entry);
+	rdma_user_mmap_entry_remove(qp->llq_desc_mmap_entry);
+	rdma_user_mmap_entry_remove(qp->sq_db_mmap_entry);
+}
+
 int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
 	struct efa_dev *dev = to_edev(ibqp->pd->device);
@@ -492,6 +424,9 @@
 	int err;
 
 	ibdev_dbg(&dev->ibdev, "Destroy qp[%u]\n", ibqp->qp_num);
+
+	efa_qp_user_mmap_entries_remove(qp);
+
 	err = efa_destroy_qp_handle(dev, qp->qp_handle);
 	if (err)
 		return err;
@@ -501,65 +436,104 @@
 			  "qp->cpu_addr[0x%p] freed: size[%lu], dma[%pad]\n",
 			  qp->rq_cpu_addr, qp->rq_size,
 			  &qp->rq_dma_addr);
-		dma_unmap_single(&dev->pdev->dev, qp->rq_dma_addr, qp->rq_size,
-				 DMA_TO_DEVICE);
+		efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr,
+				qp->rq_size, DMA_TO_DEVICE);
 	}
 
 	kfree(qp);
 	return 0;
 }
 
+static struct rdma_user_mmap_entry*
+efa_user_mmap_entry_insert(struct ib_ucontext *ucontext,
+			   u64 address, size_t length,
+			   u8 mmap_flag, u64 *offset)
+{
+	struct efa_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	int err;
+
+	if (!entry)
+		return NULL;
+
+	entry->address = address;
+	entry->mmap_flag = mmap_flag;
+
+	err = rdma_user_mmap_entry_insert(ucontext, &entry->rdma_entry,
+					  length);
+	if (err) {
+		kfree(entry);
+		return NULL;
+	}
+	*offset = rdma_user_mmap_get_offset(&entry->rdma_entry);
+
+	return &entry->rdma_entry;
+}
+
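With this helper, the *_mmap_key values returned to userspace are offsets minted by the RDMA core (rdma_user_mmap_get_offset()) rather than the driver-local (flag << 56 | page << PAGE_SHIFT) encoding removed above; userspace still just passes the key as the mmap() offset. A hedged userspace-side sketch, assuming cmd_fd (the open uverbs command fd) and resp (the create-QP response copied back by the kernel), neither of which is shown in this patch:

#include <sys/mman.h>
#include <unistd.h>

/* Map the SQ doorbell page advertised by the kernel response. */
void *sq_db = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_WRITE, MAP_SHARED,
		   cmd_fd, resp.sq_db_mmap_key);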
 static int qp_mmap_entries_setup(struct efa_qp *qp,
 				 struct efa_dev *dev,
 				 struct efa_ucontext *ucontext,
 				 struct efa_com_create_qp_params *params,
 				 struct efa_ibv_create_qp_resp *resp)
 {
-	/*
-	 * Once an entry is inserted it might be mmapped, hence cannot be
-	 * cleaned up until dealloc_ucontext.
-	 */
-	resp->sq_db_mmap_key =
-		mmap_entry_insert(dev, ucontext, qp,
-				  dev->db_bar_addr + resp->sq_db_offset,
-				  PAGE_SIZE, EFA_MMAP_IO_NC);
-	if (resp->sq_db_mmap_key == EFA_MMAP_INVALID)
+	size_t length;
+	u64 address;
+
+	address = dev->db_bar_addr + resp->sq_db_offset;
+	qp->sq_db_mmap_entry =
+		efa_user_mmap_entry_insert(&ucontext->ibucontext,
+					   address,
+					   PAGE_SIZE, EFA_MMAP_IO_NC,
+					   &resp->sq_db_mmap_key);
+	if (!qp->sq_db_mmap_entry)
 		return -ENOMEM;
 
 	resp->sq_db_offset &= ~PAGE_MASK;
 
-	resp->llq_desc_mmap_key =
-		mmap_entry_insert(dev, ucontext, qp,
-				  dev->mem_bar_addr + resp->llq_desc_offset,
-				  PAGE_ALIGN(params->sq_ring_size_in_bytes +
-					     (resp->llq_desc_offset & ~PAGE_MASK)),
-				  EFA_MMAP_IO_WC);
-	if (resp->llq_desc_mmap_key == EFA_MMAP_INVALID)
-		return -ENOMEM;
+	address = dev->mem_bar_addr + resp->llq_desc_offset;
+	length = PAGE_ALIGN(params->sq_ring_size_in_bytes +
+			    (resp->llq_desc_offset & ~PAGE_MASK));
+
+	qp->llq_desc_mmap_entry =
+		efa_user_mmap_entry_insert(&ucontext->ibucontext,
+					   address, length,
+					   EFA_MMAP_IO_WC,
+					   &resp->llq_desc_mmap_key);
+	if (!qp->llq_desc_mmap_entry)
+		goto err_remove_mmap;
 
 	resp->llq_desc_offset &= ~PAGE_MASK;
 
 	if (qp->rq_size) {
-		resp->rq_db_mmap_key =
-			mmap_entry_insert(dev, ucontext, qp,
-					  dev->db_bar_addr + resp->rq_db_offset,
-					  PAGE_SIZE, EFA_MMAP_IO_NC);
-		if (resp->rq_db_mmap_key == EFA_MMAP_INVALID)
-			return -ENOMEM;
+		address = dev->db_bar_addr + resp->rq_db_offset;
+
+		qp->rq_db_mmap_entry =
+			efa_user_mmap_entry_insert(&ucontext->ibucontext,
+						   address, PAGE_SIZE,
+						   EFA_MMAP_IO_NC,
+						   &resp->rq_db_mmap_key);
+		if (!qp->rq_db_mmap_entry)
+			goto err_remove_mmap;
 
 		resp->rq_db_offset &= ~PAGE_MASK;
 
-		resp->rq_mmap_key =
-			mmap_entry_insert(dev, ucontext, qp,
-					  virt_to_phys(qp->rq_cpu_addr),
-					  qp->rq_size, EFA_MMAP_DMA_PAGE);
-		if (resp->rq_mmap_key == EFA_MMAP_INVALID)
-			return -ENOMEM;
+		address = virt_to_phys(qp->rq_cpu_addr);
+		qp->rq_mmap_entry =
+			efa_user_mmap_entry_insert(&ucontext->ibucontext,
+						   address, qp->rq_size,
+						   EFA_MMAP_DMA_PAGE,
+						   &resp->rq_mmap_key);
+		if (!qp->rq_mmap_entry)
+			goto err_remove_mmap;
 
 		resp->rq_mmap_size = qp->rq_size;
 	}
 
 	return 0;
+
+err_remove_mmap:
+	efa_qp_user_mmap_entries_remove(qp);
+
+	return -ENOMEM;
 }
 
 static int efa_qp_validate_cap(struct efa_dev *dev,
@@ -634,7 +608,6 @@
 	struct efa_dev *dev = to_edev(ibpd->device);
 	struct efa_ibv_create_qp_resp resp = {};
 	struct efa_ibv_create_qp cmd = {};
-	bool rq_entry_inserted = false;
 	struct efa_ucontext *ucontext;
 	struct efa_qp *qp;
 	int err;
@@ -650,7 +623,7 @@
 	if (err)
 		goto err_out;
 
-	if (!field_avail(cmd, driver_qp_type, udata->inlen)) {
+	if (offsetofend(typeof(cmd), driver_qp_type) > udata->inlen) {
 		ibdev_dbg(&dev->ibdev,
 			  "Incompatible ABI params, no input udata\n");
 		err = -EINVAL;
@@ -742,7 +715,6 @@
 	if (err)
 		goto err_destroy_qp;
 
-	rq_entry_inserted = true;
 	qp->qp_handle = create_qp_resp.qp_handle;
 	qp->ibqp.qp_num = create_qp_resp.qp_num;
 	qp->max_send_wr = init_attr->cap.max_send_wr;
@@ -758,7 +730,7 @@
 			ibdev_dbg(&dev->ibdev,
 				  "Failed to copy udata for qp[%u]\n",
 				  create_qp_resp.qp_num);
-			goto err_destroy_qp;
+			goto err_remove_mmap_entries;
 		}
 	}
 
@@ -766,30 +738,141 @@
 
 	return &qp->ibqp;
 
+err_remove_mmap_entries:
+	efa_qp_user_mmap_entries_remove(qp);
 err_destroy_qp:
 	efa_destroy_qp_handle(dev, create_qp_resp.qp_handle);
 err_free_mapped:
-	if (qp->rq_size) {
-		dma_unmap_single(&dev->pdev->dev, qp->rq_dma_addr, qp->rq_size,
-				 DMA_TO_DEVICE);
-		if (!rq_entry_inserted)
-			free_pages_exact(qp->rq_cpu_addr, qp->rq_size);
-	}
+	if (qp->rq_size)
+		efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr,
+				qp->rq_size, DMA_TO_DEVICE);
 err_free_qp:
 	kfree(qp);
 err_out:
-	atomic64_inc(&dev->stats.sw_stats.create_qp_err);
+	atomic64_inc(&dev->stats.create_qp_err);
 	return ERR_PTR(err);
 }
 
+static const struct {
+	int			valid;
+	enum ib_qp_attr_mask	req_param;
+	enum ib_qp_attr_mask	opt_param;
+} srd_qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+	[IB_QPS_RESET] = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_INIT]  = {
+			.valid = 1,
+			.req_param = IB_QP_PKEY_INDEX |
+				     IB_QP_PORT |
+				     IB_QP_QKEY,
+		},
+	},
+	[IB_QPS_INIT] = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR]   = { .valid = 1 },
+		[IB_QPS_INIT]  = {
+			.valid = 1,
+			.opt_param = IB_QP_PKEY_INDEX |
+				     IB_QP_PORT |
+				     IB_QP_QKEY,
+		},
+		[IB_QPS_RTR]   = {
+			.valid = 1,
+			.opt_param = IB_QP_PKEY_INDEX |
+				     IB_QP_QKEY,
+		},
+	},
+	[IB_QPS_RTR] = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR]   = { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.req_param = IB_QP_SQ_PSN,
+			.opt_param = IB_QP_CUR_STATE |
+				     IB_QP_QKEY |
+				     IB_QP_RNR_RETRY,
+
+		}
+	},
+	[IB_QPS_RTS] = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR]   = { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.opt_param = IB_QP_CUR_STATE |
+				     IB_QP_QKEY,
+		},
+		[IB_QPS_SQD] = {
+			.valid = 1,
+			.opt_param = IB_QP_EN_SQD_ASYNC_NOTIFY,
+		},
+	},
+	[IB_QPS_SQD] = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR]   = { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.opt_param = IB_QP_CUR_STATE |
+				     IB_QP_QKEY,
+		},
+		[IB_QPS_SQD] = {
+			.valid = 1,
+			.opt_param = IB_QP_PKEY_INDEX |
+				     IB_QP_QKEY,
+		}
+	},
+	[IB_QPS_SQE] = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR]   = { .valid = 1 },
+		[IB_QPS_RTS]   = {
+			.valid = 1,
+			.opt_param = IB_QP_CUR_STATE |
+				     IB_QP_QKEY,
+		}
+	},
+	[IB_QPS_ERR] = {
+		[IB_QPS_RESET] = { .valid = 1 },
+		[IB_QPS_ERR]   = { .valid = 1 },
+	}
+};
+
+static bool efa_modify_srd_qp_is_ok(enum ib_qp_state cur_state,
+				    enum ib_qp_state next_state,
+				    enum ib_qp_attr_mask mask)
+{
+	enum ib_qp_attr_mask req_param, opt_param;
+
+	if (mask & IB_QP_CUR_STATE  &&
+	    cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
+	    cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
+		return false;
+
+	if (!srd_qp_state_table[cur_state][next_state].valid)
+		return false;
+
+	req_param = srd_qp_state_table[cur_state][next_state].req_param;
+	opt_param = srd_qp_state_table[cur_state][next_state].opt_param;
+
+	if ((mask & req_param) != req_param)
+		return false;
+
+	if (mask & ~(req_param | opt_param | IB_QP_STATE))
+		return false;
+
+	return true;
+}
+
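As a hedged illustration of the table-driven check above (not part of the patch; example_srd_reset_to_init_ok is a hypothetical name), a RESET to INIT transition on an SRD QP is accepted only when the full set of required attributes is supplied in the mask:

/*
 * Illustration only: RESET -> INIT requires PKEY_INDEX, PORT and QKEY,
 * so a mask carrying exactly those (plus IB_QP_STATE) passes the check.
 */
static bool example_srd_reset_to_init_ok(void)
{
	enum ib_qp_attr_mask mask = IB_QP_STATE | IB_QP_PKEY_INDEX |
				    IB_QP_PORT | IB_QP_QKEY;

	return efa_modify_srd_qp_is_ok(IB_QPS_RESET, IB_QPS_INIT, mask);
}
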
 static int efa_modify_qp_validate(struct efa_dev *dev, struct efa_qp *qp,
 				  struct ib_qp_attr *qp_attr, int qp_attr_mask,
 				  enum ib_qp_state cur_state,
 				  enum ib_qp_state new_state)
 {
+	int err;
+
 #define EFA_MODIFY_QP_SUPP_MASK \
 	(IB_QP_STATE | IB_QP_CUR_STATE | IB_QP_EN_SQD_ASYNC_NOTIFY | \
-	 IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY | IB_QP_SQ_PSN)
+	 IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY | IB_QP_SQ_PSN | \
+	 IB_QP_RNR_RETRY)
 
 	if (qp_attr_mask & ~EFA_MODIFY_QP_SUPP_MASK) {
 		ibdev_dbg(&dev->ibdev,
@@ -798,8 +881,14 @@
 		return -EOPNOTSUPP;
 	}
 
-	if (!ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD,
-				qp_attr_mask)) {
+	if (qp->ibqp.qp_type == IB_QPT_DRIVER)
+		err = !efa_modify_srd_qp_is_ok(cur_state, new_state,
+					       qp_attr_mask);
+	else
+		err = !ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD,
+					  qp_attr_mask);
+
+	if (err) {
 		ibdev_dbg(&dev->ibdev, "Invalid modify QP parameters\n");
 		return -EINVAL;
 	}
@@ -846,28 +935,36 @@
 	params.qp_handle = qp->qp_handle;
 
 	if (qp_attr_mask & IB_QP_STATE) {
-		params.modify_mask |= BIT(EFA_ADMIN_QP_STATE_BIT) |
-				      BIT(EFA_ADMIN_CUR_QP_STATE_BIT);
-		params.cur_qp_state = qp_attr->cur_qp_state;
-		params.qp_state = qp_attr->qp_state;
+		EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_QP_STATE,
+			1);
+		EFA_SET(&params.modify_mask,
+			EFA_ADMIN_MODIFY_QP_CMD_CUR_QP_STATE, 1);
+		params.cur_qp_state = cur_state;
+		params.qp_state = new_state;
 	}
 
 	if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) {
-		params.modify_mask |=
-			BIT(EFA_ADMIN_SQ_DRAINED_ASYNC_NOTIFY_BIT);
+		EFA_SET(&params.modify_mask,
+			EFA_ADMIN_MODIFY_QP_CMD_SQ_DRAINED_ASYNC_NOTIFY, 1);
 		params.sq_drained_async_notify = qp_attr->en_sqd_async_notify;
 	}
 
 	if (qp_attr_mask & IB_QP_QKEY) {
-		params.modify_mask |= BIT(EFA_ADMIN_QKEY_BIT);
+		EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_QKEY, 1);
 		params.qkey = qp_attr->qkey;
 	}
 
 	if (qp_attr_mask & IB_QP_SQ_PSN) {
-		params.modify_mask |= BIT(EFA_ADMIN_SQ_PSN_BIT);
+		EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_SQ_PSN, 1);
 		params.sq_psn = qp_attr->sq_psn;
 	}
 
+	if (qp_attr_mask & IB_QP_RNR_RETRY) {
+		EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_RNR_RETRY,
+			1);
+		params.rnr_retry = qp_attr->rnr_retry;
+	}
+
 	err = efa_com_modify_qp(&dev->edev, &params);
 	if (err)
 		return err;
@@ -884,7 +981,7 @@
 	return efa_com_destroy_cq(&dev->edev, &params);
 }
 
-void efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct efa_dev *dev = to_edev(ibcq->device);
 	struct efa_cq *cq = to_ecq(ibcq);
@@ -893,19 +990,22 @@
 		  "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n",
 		  cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr);
 
+	rdma_user_mmap_entry_remove(cq->mmap_entry);
 	efa_destroy_cq_idx(dev, cq->cq_idx);
-	dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size,
-			 DMA_FROM_DEVICE);
+	efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size,
+			DMA_FROM_DEVICE);
+	return 0;
 }
 
 static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
 				 struct efa_ibv_create_cq_resp *resp)
 {
 	resp->q_mmap_size = cq->size;
-	resp->q_mmap_key = mmap_entry_insert(dev, cq->ucontext, cq,
-					     virt_to_phys(cq->cpu_addr),
-					     cq->size, EFA_MMAP_DMA_PAGE);
-	if (resp->q_mmap_key == EFA_MMAP_INVALID)
+	cq->mmap_entry = efa_user_mmap_entry_insert(&cq->ucontext->ibucontext,
+						    virt_to_phys(cq->cpu_addr),
+						    cq->size, EFA_MMAP_DMA_PAGE,
+						    &resp->q_mmap_key);
+	if (!cq->mmap_entry)
 		return -ENOMEM;
 
 	return 0;
@@ -923,7 +1023,6 @@
 	struct efa_dev *dev = to_edev(ibdev);
 	struct efa_ibv_create_cq cmd = {};
 	struct efa_cq *cq = to_ecq(ibcq);
-	bool cq_entry_inserted = false;
 	int entries = attr->cqe;
 	int err;
 
@@ -937,7 +1036,7 @@
 		goto err_out;
 	}
 
-	if (!field_avail(cmd, num_sub_cqs, udata->inlen)) {
+	if (offsetofend(typeof(cmd), num_sub_cqs) > udata->inlen) {
 		ibdev_dbg(ibdev,
 			  "Incompatible ABI params, no input udata\n");
 		err = -EINVAL;
@@ -1012,15 +1111,13 @@
 		goto err_destroy_cq;
 	}
 
-	cq_entry_inserted = true;
-
 	if (udata->outlen) {
 		err = ib_copy_to_udata(udata, &resp,
 				       min(sizeof(resp), udata->outlen));
 		if (err) {
 			ibdev_dbg(ibdev,
 				  "Failed to copy udata for create_cq\n");
-			goto err_destroy_cq;
+			goto err_remove_mmap;
 		}
 	}
 
@@ -1029,15 +1126,16 @@
 
 	return 0;
 
+err_remove_mmap:
+	rdma_user_mmap_entry_remove(cq->mmap_entry);
 err_destroy_cq:
 	efa_destroy_cq_idx(dev, cq->cq_idx);
 err_free_mapped:
-	dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size,
-			 DMA_FROM_DEVICE);
-	if (!cq_entry_inserted)
-		free_pages_exact(cq->cpu_addr, cq->size);
+	efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size,
+			DMA_FROM_DEVICE);
+
 err_out:
-	atomic64_inc(&dev->stats.sw_stats.create_cq_err);
+	atomic64_inc(&dev->stats.create_cq_err);
 	return err;
 }
 
@@ -1054,8 +1152,7 @@
 	ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n",
 		  hp_cnt, pages_in_hp);
 
-	rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap,
-			    BIT(hp_shift))
+	rdma_umem_for_each_dma_block(umem, &biter, BIT(hp_shift))
 		page_list[hp_idx++] = rdma_block_iter_dma_address(&biter);
 
 	return 0;
@@ -1067,7 +1164,7 @@
 	struct page *pg;
 	int i;
 
-	sglist = kcalloc(page_cnt, sizeof(*sglist), GFP_KERNEL);
+	sglist = kmalloc_array(page_cnt, sizeof(*sglist), GFP_KERNEL);
 	if (!sglist)
 		return NULL;
 	sg_init_table(sglist, page_cnt);
@@ -1395,12 +1492,13 @@
 	struct efa_com_reg_mr_params params = {};
 	struct efa_com_reg_mr_result result = {};
 	struct pbl_context pbl;
+	int supp_access_flags;
 	unsigned int pg_sz;
 	struct efa_mr *mr;
 	int inline_size;
 	int err;
 
-	if (udata->inlen &&
+	if (udata && udata->inlen &&
 	    !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) {
 		ibdev_dbg(&dev->ibdev,
 			  "Incompatible ABI params, udata not cleared\n");
@@ -1408,10 +1506,15 @@
 		goto err_out;
 	}
 
-	if (access_flags & ~EFA_SUPPORTED_ACCESS_FLAGS) {
+	supp_access_flags =
+		IB_ACCESS_LOCAL_WRITE |
+		(EFA_DEV_CAP(dev, RDMA_READ) ? IB_ACCESS_REMOTE_READ : 0);
+
+	access_flags &= ~IB_ACCESS_OPTIONAL;
+	if (access_flags & ~supp_access_flags) {
 		ibdev_dbg(&dev->ibdev,
 			  "Unsupported access flags[%#x], supported[%#x]\n",
-			  access_flags, EFA_SUPPORTED_ACCESS_FLAGS);
+			  access_flags, supp_access_flags);
 		err = -EOPNOTSUPP;
 		goto err_out;
 	}
@@ -1422,7 +1525,7 @@
 		goto err_out;
 	}
 
-	mr->umem = ib_umem_get(udata, start, length, access_flags, 0);
+	mr->umem = ib_umem_get(ibpd->device, start, length, access_flags);
 	if (IS_ERR(mr->umem)) {
 		err = PTR_ERR(mr->umem);
 		ibdev_dbg(&dev->ibdev,
@@ -1433,7 +1536,7 @@
 	params.pd = to_epd(ibpd)->pdn;
 	params.iova = virt_addr;
 	params.mr_length_in_bytes = length;
-	params.permissions = access_flags & 0x1;
+	params.permissions = access_flags;
 
 	pg_sz = ib_umem_find_best_pgsz(mr->umem,
 				       dev->dev_attr.page_size_cap,
@@ -1445,9 +1548,8 @@
 		goto err_unmap;
 	}
 
-	params.page_shift = __ffs(pg_sz);
-	params.page_num = DIV_ROUND_UP(length + (start & (pg_sz - 1)),
-				       pg_sz);
+	params.page_shift = order_base_2(pg_sz);
+	params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz);
 
 	ibdev_dbg(&dev->ibdev,
 		  "start %#llx length %#llx params.page_shift %u params.page_num %u\n",
@@ -1486,7 +1588,7 @@
 err_free:
 	kfree(mr);
 err_out:
-	atomic64_inc(&dev->stats.sw_stats.reg_mr_err);
+	atomic64_inc(&dev->stats.reg_mr_err);
 	return ERR_PTR(err);
 }
 
@@ -1537,11 +1639,39 @@
 	return efa_com_dealloc_uar(&dev->edev, &params);
 }
 
+#define EFA_CHECK_USER_COMP(_dev, _comp_mask, _attr, _mask, _attr_str) \
+	(_attr_str = (!(_dev)->dev_attr._attr || ((_comp_mask) & (_mask))) ? \
+		     NULL : #_attr)
+
+static int efa_user_comp_handshake(const struct ib_ucontext *ibucontext,
+				   const struct efa_ibv_alloc_ucontext_cmd *cmd)
+{
+	struct efa_dev *dev = to_edev(ibucontext->device);
+	char *attr_str;
+
+	if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, max_tx_batch,
+				EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH, attr_str))
+		goto err;
+
+	if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, min_sq_depth,
+				EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR,
+				attr_str))
+		goto err;
+
+	return 0;
+
+err:
+	ibdev_dbg(&dev->ibdev, "Userspace handshake failed for %s attribute\n",
+		  attr_str);
+	return -EOPNOTSUPP;
+}
+
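To make the macro above easier to follow, here is a hand-expanded sketch (illustration only, not part of the patch; example_tx_batch_acked is a hypothetical name) of its first use: the handshake only fails when the device reports an attribute that userspace did not acknowledge in comp_mask.

/*
 * Illustration only: roughly what EFA_CHECK_USER_COMP(dev, cmd->comp_mask,
 * max_tx_batch, EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH, attr_str) tests.
 * A true result means the attribute is either absent or acknowledged,
 * i.e. the handshake for max_tx_batch succeeds.
 */
static bool example_tx_batch_acked(struct efa_dev *dev,
				   const struct efa_ibv_alloc_ucontext_cmd *cmd)
{
	return !dev->dev_attr.max_tx_batch ||
	       (cmd->comp_mask & EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH);
}
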
 int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata)
 {
 	struct efa_ucontext *ucontext = to_eucontext(ibucontext);
 	struct efa_dev *dev = to_edev(ibucontext->device);
 	struct efa_ibv_alloc_ucontext_resp resp = {};
+	struct efa_ibv_alloc_ucontext_cmd cmd = {};
 	struct efa_com_alloc_uar_result result;
 	int err;
 
@@ -1550,32 +1680,43 @@
 	 * we will ack input fields in our response.
 	 */
 
+	err = ib_copy_from_udata(&cmd, udata,
+				 min(sizeof(cmd), udata->inlen));
+	if (err) {
+		ibdev_dbg(&dev->ibdev,
+			  "Cannot copy udata for alloc_ucontext\n");
+		goto err_out;
+	}
+
+	err = efa_user_comp_handshake(ibucontext, &cmd);
+	if (err)
+		goto err_out;
+
 	err = efa_com_alloc_uar(&dev->edev, &result);
 	if (err)
 		goto err_out;
 
 	ucontext->uarn = result.uarn;
-	xa_init(&ucontext->mmap_xa);
 
 	resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE;
 	resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH;
 	resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq;
 	resp.inline_buf_size = dev->dev_attr.inline_buf_size;
 	resp.max_llq_size = dev->dev_attr.max_llq_size;
+	resp.max_tx_batch = dev->dev_attr.max_tx_batch;
+	resp.min_sq_wr = dev->dev_attr.min_sq_depth;
 
-	if (udata && udata->outlen) {
-		err = ib_copy_to_udata(udata, &resp,
-				       min(sizeof(resp), udata->outlen));
-		if (err)
-			goto err_dealloc_uar;
-	}
+	err = ib_copy_to_udata(udata, &resp,
+			       min(sizeof(resp), udata->outlen));
+	if (err)
+		goto err_dealloc_uar;
 
 	return 0;
 
 err_dealloc_uar:
 	efa_dealloc_uar(dev, result.uarn);
 err_out:
-	atomic64_inc(&dev->stats.sw_stats.alloc_ucontext_err);
+	atomic64_inc(&dev->stats.alloc_ucontext_err);
 	return err;
 }
 
@@ -1584,38 +1725,53 @@
 	struct efa_ucontext *ucontext = to_eucontext(ibucontext);
 	struct efa_dev *dev = to_edev(ibucontext->device);
 
-	mmap_entries_remove_free(dev, ucontext);
 	efa_dealloc_uar(dev, ucontext->uarn);
 }
 
-static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
-		      struct vm_area_struct *vma, u64 key, u64 length)
+void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
 {
-	struct efa_mmap_entry *entry;
-	unsigned long va;
-	u64 pfn;
-	int err;
+	struct efa_user_mmap_entry *entry = to_emmap(rdma_entry);
 
-	entry = mmap_entry_get(dev, ucontext, key, length);
-	if (!entry) {
-		ibdev_dbg(&dev->ibdev, "key[%#llx] does not have valid entry\n",
-			  key);
+	kfree(entry);
+}
+
+static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
+		      struct vm_area_struct *vma)
+{
+	struct rdma_user_mmap_entry *rdma_entry;
+	struct efa_user_mmap_entry *entry;
+	unsigned long va;
+	int err = 0;
+	u64 pfn;
+
+	rdma_entry = rdma_user_mmap_entry_get(&ucontext->ibucontext, vma);
+	if (!rdma_entry) {
+		ibdev_dbg(&dev->ibdev,
+			  "pgoff[%#lx] does not have valid entry\n",
+			  vma->vm_pgoff);
+		atomic64_inc(&dev->stats.mmap_err);
 		return -EINVAL;
 	}
+	entry = to_emmap(rdma_entry);
 
 	ibdev_dbg(&dev->ibdev,
-		  "Mapping address[%#llx], length[%#llx], mmap_flag[%d]\n",
-		  entry->address, length, entry->mmap_flag);
+		  "Mapping address[%#llx], length[%#zx], mmap_flag[%d]\n",
+		  entry->address, rdma_entry->npages * PAGE_SIZE,
+		  entry->mmap_flag);
 
 	pfn = entry->address >> PAGE_SHIFT;
 	switch (entry->mmap_flag) {
 	case EFA_MMAP_IO_NC:
-		err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, length,
-					pgprot_noncached(vma->vm_page_prot));
+		err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn,
+					entry->rdma_entry.npages * PAGE_SIZE,
+					pgprot_noncached(vma->vm_page_prot),
+					rdma_entry);
 		break;
 	case EFA_MMAP_IO_WC:
-		err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, length,
-					pgprot_writecombine(vma->vm_page_prot));
+		err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn,
+					entry->rdma_entry.npages * PAGE_SIZE,
+					pgprot_writecombine(vma->vm_page_prot),
+					rdma_entry);
 		break;
 	case EFA_MMAP_DMA_PAGE:
 		for (va = vma->vm_start; va < vma->vm_end;
@@ -1632,12 +1788,14 @@
 	if (err) {
 		ibdev_dbg(
 			&dev->ibdev,
-			"Couldn't mmap address[%#llx] length[%#llx] mmap_flag[%d] err[%d]\n",
-			entry->address, length, entry->mmap_flag, err);
-		return err;
+			"Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n",
+			entry->address, rdma_entry->npages * PAGE_SIZE,
+			entry->mmap_flag, err);
+		atomic64_inc(&dev->stats.mmap_err);
 	}
 
-	return 0;
+	rdma_user_mmap_entry_put(rdma_entry);
+	return err;
 }
 
 int efa_mmap(struct ib_ucontext *ibucontext,
@@ -1645,26 +1803,13 @@
 {
 	struct efa_ucontext *ucontext = to_eucontext(ibucontext);
 	struct efa_dev *dev = to_edev(ibucontext->device);
-	u64 length = vma->vm_end - vma->vm_start;
-	u64 key = vma->vm_pgoff << PAGE_SHIFT;
+	size_t length = vma->vm_end - vma->vm_start;
 
 	ibdev_dbg(&dev->ibdev,
-		  "start %#lx, end %#lx, length = %#llx, key = %#llx\n",
-		  vma->vm_start, vma->vm_end, length, key);
+		  "start %#lx, end %#lx, length = %#zx, pgoff = %#lx\n",
+		  vma->vm_start, vma->vm_end, length, vma->vm_pgoff);
 
-	if (length % PAGE_SIZE != 0 || !(vma->vm_flags & VM_SHARED)) {
-		ibdev_dbg(&dev->ibdev,
-			  "length[%#llx] is not page size aligned[%#lx] or VM_SHARED is not set [%#lx]\n",
-			  length, PAGE_SIZE, vma->vm_flags);
-		return -EINVAL;
-	}
-
-	if (vma->vm_flags & VM_EXEC) {
-		ibdev_dbg(&dev->ibdev, "Mapping executable pages is not permitted\n");
-		return -EPERM;
-	}
-
-	return __efa_mmap(dev, ucontext, vma, key, length);
+	return __efa_mmap(dev, ucontext, vma);
 }
 
 static int efa_ah_destroy(struct efa_dev *dev, struct efa_ah *ah)
@@ -1678,10 +1823,10 @@
 }
 
 int efa_create_ah(struct ib_ah *ibah,
-		  struct rdma_ah_attr *ah_attr,
-		  u32 flags,
+		  struct rdma_ah_init_attr *init_attr,
 		  struct ib_udata *udata)
 {
+	struct rdma_ah_attr *ah_attr = init_attr->ah_attr;
 	struct efa_dev *dev = to_edev(ibah->device);
 	struct efa_com_create_ah_params params = {};
 	struct efa_ibv_create_ah_resp resp = {};
@@ -1689,7 +1834,7 @@
 	struct efa_ah *ah = to_eah(ibah);
 	int err;
 
-	if (!(flags & RDMA_CREATE_AH_SLEEPABLE)) {
+	if (!(init_attr->flags & RDMA_CREATE_AH_SLEEPABLE)) {
 		ibdev_dbg(&dev->ibdev,
 			  "Create address handle is not supported in atomic context\n");
 		err = -EOPNOTSUPP;
@@ -1731,11 +1876,11 @@
 err_destroy_ah:
 	efa_ah_destroy(dev, ah);
 err_out:
-	atomic64_inc(&dev->stats.sw_stats.create_ah_err);
+	atomic64_inc(&dev->stats.create_ah_err);
 	return err;
 }
 
-void efa_destroy_ah(struct ib_ah *ibah, u32 flags)
+int efa_destroy_ah(struct ib_ah *ibah, u32 flags)
 {
 	struct efa_dev *dev = to_edev(ibah->pd->device);
 	struct efa_ah *ah = to_eah(ibah);
@@ -1745,10 +1890,11 @@
 	if (!(flags & RDMA_DESTROY_AH_SLEEPABLE)) {
 		ibdev_dbg(&dev->ibdev,
 			  "Destroy address handle is not supported in atomic context\n");
-		return;
+		return -EOPNOTSUPP;
 	}
 
 	efa_ah_destroy(dev, ah);
+	return 0;
 }
 
 struct rdma_hw_stats *efa_alloc_hw_stats(struct ib_device *ibdev, u8 port_num)
@@ -1764,13 +1910,15 @@
 	struct efa_com_get_stats_params params = {};
 	union efa_com_get_stats_result result;
 	struct efa_dev *dev = to_edev(ibdev);
+	struct efa_com_rdma_read_stats *rrs;
+	struct efa_com_messages_stats *ms;
 	struct efa_com_basic_stats *bs;
 	struct efa_com_stats_admin *as;
 	struct efa_stats *s;
 	int err;
 
-	params.type = EFA_ADMIN_GET_STATS_TYPE_BASIC;
 	params.scope = EFA_ADMIN_GET_STATS_SCOPE_ALL;
+	params.type = EFA_ADMIN_GET_STATS_TYPE_BASIC;
 
 	err = efa_com_get_stats(&dev->edev, &params, &result);
 	if (err)
@@ -1783,18 +1931,44 @@
 	stats->value[EFA_RX_PKTS] = bs->rx_pkts;
 	stats->value[EFA_RX_DROPS] = bs->rx_drops;
 
+	params.type = EFA_ADMIN_GET_STATS_TYPE_MESSAGES;
+	err = efa_com_get_stats(&dev->edev, &params, &result);
+	if (err)
+		return err;
+
+	ms = &result.messages_stats;
+	stats->value[EFA_SEND_BYTES] = ms->send_bytes;
+	stats->value[EFA_SEND_WRS] = ms->send_wrs;
+	stats->value[EFA_RECV_BYTES] = ms->recv_bytes;
+	stats->value[EFA_RECV_WRS] = ms->recv_wrs;
+
+	params.type = EFA_ADMIN_GET_STATS_TYPE_RDMA_READ;
+	err = efa_com_get_stats(&dev->edev, &params, &result);
+	if (err)
+		return err;
+
+	rrs = &result.rdma_read_stats;
+	stats->value[EFA_RDMA_READ_WRS] = rrs->read_wrs;
+	stats->value[EFA_RDMA_READ_BYTES] = rrs->read_bytes;
+	stats->value[EFA_RDMA_READ_WR_ERR] = rrs->read_wr_err;
+	stats->value[EFA_RDMA_READ_RESP_BYTES] = rrs->read_resp_bytes;
+
 	as = &dev->edev.aq.stats;
 	stats->value[EFA_SUBMITTED_CMDS] = atomic64_read(&as->submitted_cmd);
 	stats->value[EFA_COMPLETED_CMDS] = atomic64_read(&as->completed_cmd);
+	stats->value[EFA_CMDS_ERR] = atomic64_read(&as->cmd_err);
 	stats->value[EFA_NO_COMPLETION_CMDS] = atomic64_read(&as->no_completion);
 
 	s = &dev->stats;
 	stats->value[EFA_KEEP_ALIVE_RCVD] = atomic64_read(&s->keep_alive_rcvd);
-	stats->value[EFA_ALLOC_PD_ERR] = atomic64_read(&s->sw_stats.alloc_pd_err);
-	stats->value[EFA_CREATE_QP_ERR] = atomic64_read(&s->sw_stats.create_qp_err);
-	stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->sw_stats.reg_mr_err);
-	stats->value[EFA_ALLOC_UCONTEXT_ERR] = atomic64_read(&s->sw_stats.alloc_ucontext_err);
-	stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->sw_stats.create_ah_err);
+	stats->value[EFA_ALLOC_PD_ERR] = atomic64_read(&s->alloc_pd_err);
+	stats->value[EFA_CREATE_QP_ERR] = atomic64_read(&s->create_qp_err);
+	stats->value[EFA_CREATE_CQ_ERR] = atomic64_read(&s->create_cq_err);
+	stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->reg_mr_err);
+	stats->value[EFA_ALLOC_UCONTEXT_ERR] =
+		atomic64_read(&s->alloc_ucontext_err);
+	stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->create_ah_err);
+	stats->value[EFA_MMAP_ERR] = atomic64_read(&s->mmap_err);
 
 	return ARRAY_SIZE(efa_stats_names);
 }
diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig
index 0653f4f..519866b 100644
--- a/drivers/infiniband/hw/hfi1/Kconfig
+++ b/drivers/infiniband/hw/hfi1/Kconfig
@@ -5,19 +5,19 @@
 	select MMU_NOTIFIER
 	select CRC32
 	select I2C_ALGOBIT
-	---help---
+	help
 	This is a low-level driver for Intel OPA Gen1 adapter.
 config HFI1_DEBUG_SDMA_ORDER
 	bool "HFI1 SDMA Order debug"
 	depends on INFINIBAND_HFI1
 	default n
-	---help---
+	help
 	This is a debug flag to test for out of order
 	sdma completions for unit testing
 config SDMA_VERBOSITY
 	bool "Config SDMA Verbosity"
 	depends on INFINIBAND_HFI1
 	default n
-	---help---
+	help
 	This is a configuration flag to enable verbose
 	SDMA debug
diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile
index 0405d26..2e89ec1 100644
--- a/drivers/infiniband/hw/hfi1/Makefile
+++ b/drivers/infiniband/hw/hfi1/Makefile
@@ -22,9 +22,13 @@
 	init.o \
 	intr.o \
 	iowait.o \
+	ipoib_main.o \
+	ipoib_rx.o \
+	ipoib_tx.o \
 	mad.o \
 	mmu_rb.o \
 	msix.o \
+	netdev_rx.o \
 	opfn.o \
 	pcie.o \
 	pio.o \
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
index 1aeea5d..04b1e8f 100644
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -64,6 +64,7 @@
 static const char * const irq_type_names[] = {
 	"SDMA",
 	"RCVCTXT",
+	"NETDEVCTXT",
 	"GENERAL",
 	"OTHER",
 };
@@ -631,22 +632,11 @@
  */
 int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
 {
-	int node = pcibus_to_node(dd->pcidev->bus);
 	struct hfi1_affinity_node *entry;
 	const struct cpumask *local_mask;
 	int curr_cpu, possible, i, ret;
 	bool new_entry = false;
 
-	/*
-	 * If the BIOS does not have the NUMA node information set, select
-	 * NUMA 0 so we get consistent performance.
-	 */
-	if (node < 0) {
-		dd_dev_err(dd, "Invalid PCI NUMA node. Performance may be affected\n");
-		node = 0;
-	}
-	dd->node = node;
-
 	local_mask = cpumask_of_node(dd->node);
 	if (cpumask_first(local_mask) >= nr_cpu_ids)
 		local_mask = topology_core_cpumask(0);
@@ -659,7 +649,7 @@
 	 * create an entry in the global affinity structure and initialize it.
 	 */
 	if (!entry) {
-		entry = node_affinity_allocate(node);
+		entry = node_affinity_allocate(dd->node);
 		if (!entry) {
 			dd_dev_err(dd,
 				   "Unable to allocate global affinity node\n");
@@ -750,6 +740,7 @@
 	if (new_entry)
 		node_affinity_add_tail(entry);
 
+	dd->affinity_entry = entry;
 	mutex_unlock(&node_affinity.lock);
 
 	return 0;
@@ -765,10 +756,9 @@
 {
 	struct hfi1_affinity_node *entry;
 
-	if (dd->node < 0)
-		return;
-
 	mutex_lock(&node_affinity.lock);
+	if (!dd->affinity_entry)
+		goto unlock;
 	entry = node_affinity_lookup(dd->node);
 	if (!entry)
 		goto unlock;
@@ -779,8 +769,8 @@
 	 */
 	_dev_comp_vect_cpu_mask_clean_up(dd, entry);
 unlock:
+	dd->affinity_entry = NULL;
 	mutex_unlock(&node_affinity.lock);
-	dd->node = NUMA_NO_NODE;
 }
 
 /*
@@ -915,6 +905,11 @@
 			set = &entry->rcv_intr;
 		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
 		break;
+	case IRQ_NETDEVCTXT:
+		rcd = (struct hfi1_ctxtdata *)msix->arg;
+		set = &entry->def_intr;
+		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
+		break;
 	default:
 		dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type);
 		return -EINVAL;
@@ -987,6 +982,10 @@
 		if (rcd->ctxt != HFI1_CTRL_CTXT)
 			set = &entry->rcv_intr;
 		break;
+	case IRQ_NETDEVCTXT:
+		rcd = (struct hfi1_ctxtdata *)msix->arg;
+		set = &entry->def_intr;
+		break;
 	default:
 		mutex_unlock(&node_affinity.lock);
 		return;
diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h
index 6a7e6ea..f94ed5d 100644
--- a/drivers/infiniband/hw/hfi1/affinity.h
+++ b/drivers/infiniband/hw/hfi1/affinity.h
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -52,6 +52,7 @@
 enum irq_type {
 	IRQ_SDMA,
 	IRQ_RCVCTXT,
+	IRQ_NETDEVCTXT,
 	IRQ_GENERAL,
 	IRQ_OTHER
 };
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 10924f1..88476a1 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -66,10 +66,7 @@
 #include "affinity.h"
 #include "debugfs.h"
 #include "fault.h"
-
-uint kdeth_qp;
-module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO);
-MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix");
+#include "netdev.h"
 
 uint num_vls = HFI1_MAX_VLS_SUPPORTED;
 module_param(num_vls, uint, S_IRUGO);
@@ -128,13 +125,15 @@
 
 /*
  * RSM instance allocation
- *   0 - Verbs
- *   1 - User Fecn Handling
- *   2 - Vnic
+ *   0 - User Fecn Handling
+ *   1 - Vnic
+ *   2 - AIP
+ *   3 - Verbs
  */
-#define RSM_INS_VERBS             0
-#define RSM_INS_FECN              1
-#define RSM_INS_VNIC              2
+#define RSM_INS_FECN              0
+#define RSM_INS_VNIC              1
+#define RSM_INS_AIP               2
+#define RSM_INS_VERBS             3
 
 /* Bit offset into the GUID which carries HFI id information */
 #define GUID_HFI_INDEX_SHIFT     39
@@ -175,6 +174,25 @@
 /* QPN[m+n:1] QW 1, OFFSET 1 */
 #define QPN_SELECT_OFFSET      ((1ull << QW_SHIFT) | (1ull))
 
+/* RSM fields for AIP */
+/* LRH.BTH above is reused for this rule */
+
+/* BTH.DESTQP: QW 1, OFFSET 16 for match */
+#define BTH_DESTQP_QW           1ull
+#define BTH_DESTQP_BIT_OFFSET   16ull
+#define BTH_DESTQP_OFFSET(off) ((BTH_DESTQP_QW << QW_SHIFT) | (off))
+#define BTH_DESTQP_MATCH_OFFSET BTH_DESTQP_OFFSET(BTH_DESTQP_BIT_OFFSET)
+#define BTH_DESTQP_MASK         0xFFull
+#define BTH_DESTQP_VALUE        0x81ull
+
+/* DETH.SQPN: QW 3, OFFSET 56 for select */
+/* We use the 8 most significant source QPN bits as entropy for AIP */
+#define DETH_AIP_SQPN_QW 3ull
+#define DETH_AIP_SQPN_BIT_OFFSET 56ull
+#define DETH_AIP_SQPN_OFFSET(off) ((DETH_AIP_SQPN_QW << QW_SHIFT) | (off))
+#define DETH_AIP_SQPN_SELECT_OFFSET \
+	DETH_AIP_SQPN_OFFSET(DETH_AIP_SQPN_BIT_OFFSET)
+
 /* RSM fields for Vnic */
 /* L2_TYPE: QW 0, OFFSET 61 - for match */
 #define L2_TYPE_QW             0ull
@@ -6873,7 +6891,7 @@
 		}
 		rcvmask = HFI1_RCVCTRL_CTXT_ENB;
 		/* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */
-		rcvmask |= rcd->rcvhdrtail_kvaddr ?
+		rcvmask |= hfi1_rcvhdrtail_kvaddr(rcd) ?
 			HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS;
 		hfi1_rcvctrl(dd, rcvmask, rcd);
 		hfi1_rcd_put(rcd);
@@ -7299,11 +7317,11 @@
 	case 1: return OPA_LINK_WIDTH_1X;
 	case 2: return OPA_LINK_WIDTH_2X;
 	case 3: return OPA_LINK_WIDTH_3X;
+	case 4: return OPA_LINK_WIDTH_4X;
 	default:
 		dd_dev_info(dd, "%s: invalid width %d, using 4\n",
 			    __func__, width);
-		/* fall through */
-	case 4: return OPA_LINK_WIDTH_4X;
+		return OPA_LINK_WIDTH_4X;
 	}
 }
 
@@ -7358,12 +7376,13 @@
 		case 0:
 			dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G;
 			break;
+		case 1:
+			dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
+			break;
 		default:
 			dd_dev_err(dd,
 				   "%s: unexpected max rate %d, using 25Gb\n",
 				   __func__, (int)max_rate);
-			/* fall through */
-		case 1:
 			dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G;
 			break;
 		}
@@ -8405,20 +8424,107 @@
 static inline int check_packet_present(struct hfi1_ctxtdata *rcd)
 {
 	u32 tail;
-	int present;
 
-	if (!rcd->rcvhdrtail_kvaddr)
-		present = (rcd->seq_cnt ==
-				rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd))));
-	else /* is RDMA rtail */
-		present = (rcd->head != get_rcvhdrtail(rcd));
-
-	if (present)
+	if (hfi1_packet_present(rcd))
 		return 1;
 
 	/* fall back to a CSR read, correct indpendent of DMA_RTAIL */
 	tail = (u32)read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
-	return rcd->head != tail;
+	return hfi1_rcd_head(rcd) != tail;
+}
+
+/**
+ * Common code for receive context interrupt handlers.
+ * Update traces, increment the kernel IRQ counter and
+ * set up ASPM when needed.
+ */
+static void receive_interrupt_common(struct hfi1_ctxtdata *rcd)
+{
+	struct hfi1_devdata *dd = rcd->dd;
+
+	trace_hfi1_receive_interrupt(dd, rcd);
+	this_cpu_inc(*dd->int_counter);
+	aspm_ctx_disable(rcd);
+}
+
+/**
+ * __hfi1_rcd_eoi_intr() - Make HW issue receive interrupt
+ * when there are packets present in the queue. When calling
+ * with interrupts enabled, please use hfi1_rcd_eoi_intr().
+ *
+ * @rcd: valid receive context
+ */
+static void __hfi1_rcd_eoi_intr(struct hfi1_ctxtdata *rcd)
+{
+	if (!rcd->rcvhdrq)
+		return;
+	clear_recv_intr(rcd);
+	if (check_packet_present(rcd))
+		force_recv_intr(rcd);
+}
+
+/**
+ * hfi1_rcd_eoi_intr() - End of Interrupt processing action
+ *
+ * @rcd: Ptr to hfi1_ctxtdata of receive context
+ *
+ *  Hold IRQs so we can safely clear the interrupt and
+ *  recheck for a packet that may have arrived after the previous
+ *  check and the interrupt clear.  If a packet arrived, force another
+ *  interrupt. This routine can be called at the end of receive packet
+ *  processing in interrupt service routines, interrupt service thread
+ *  and softirqs
+ */
+static void hfi1_rcd_eoi_intr(struct hfi1_ctxtdata *rcd)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__hfi1_rcd_eoi_intr(rcd);
+	local_irq_restore(flags);
+}
+
+/**
+ * hfi1_netdev_rx_napi - napi poll function to move eoi inline
+ * @napi: pointer to napi object
+ * @budget: netdev budget
+ */
+int hfi1_netdev_rx_napi(struct napi_struct *napi, int budget)
+{
+	struct hfi1_netdev_rxq *rxq = container_of(napi,
+			struct hfi1_netdev_rxq, napi);
+	struct hfi1_ctxtdata *rcd = rxq->rcd;
+	int work_done = 0;
+
+	work_done = rcd->do_interrupt(rcd, budget);
+
+	if (work_done < budget) {
+		napi_complete_done(napi, work_done);
+		hfi1_rcd_eoi_intr(rcd);
+	}
+
+	return work_done;
+}
+
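For context, a minimal sketch (assumption, not taken from this patch; example_rxq_napi_setup is a hypothetical name) of how a poll callback of this shape is normally wired up: each receive queue registers hfi1_netdev_rx_napi() with the core NAPI machinery and enables it before traffic starts.

/*
 * Illustration only: standard NAPI registration for the poll function
 * above, using the conventional weight (NAPI_POLL_WEIGHT == 64).
 */
static void example_rxq_napi_setup(struct net_device *dev,
				   struct hfi1_netdev_rxq *rxq)
{
	netif_napi_add(dev, &rxq->napi, hfi1_netdev_rx_napi, NAPI_POLL_WEIGHT);
	napi_enable(&rxq->napi);
}
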
+/* Receive packet napi handler for netdev contexts (VNIC and AIP) */
+irqreturn_t receive_context_interrupt_napi(int irq, void *data)
+{
+	struct hfi1_ctxtdata *rcd = data;
+
+	receive_interrupt_common(rcd);
+
+	if (likely(rcd->napi)) {
+		if (likely(napi_schedule_prep(rcd->napi)))
+			__napi_schedule_irqoff(rcd->napi);
+		else
+			__hfi1_rcd_eoi_intr(rcd);
+	} else {
+		WARN_ONCE(1, "Napi IRQ handler without napi set up ctxt=%d\n",
+			  rcd->ctxt);
+		__hfi1_rcd_eoi_intr(rcd);
+	}
+
+	return IRQ_HANDLED;
 }
 
 /*
@@ -8432,13 +8538,9 @@
 irqreturn_t receive_context_interrupt(int irq, void *data)
 {
 	struct hfi1_ctxtdata *rcd = data;
-	struct hfi1_devdata *dd = rcd->dd;
 	int disposition;
-	int present;
 
-	trace_hfi1_receive_interrupt(dd, rcd);
-	this_cpu_inc(*dd->int_counter);
-	aspm_ctx_disable(rcd);
+	receive_interrupt_common(rcd);
 
 	/* receive interrupt remains blocked while processing packets */
 	disposition = rcd->do_interrupt(rcd, 0);
@@ -8451,17 +8553,7 @@
 	if (disposition == RCV_PKT_LIMIT)
 		return IRQ_WAKE_THREAD;
 
-	/*
-	 * The packet processor detected no more packets.  Clear the receive
-	 * interrupt and recheck for a packet packet that may have arrived
-	 * after the previous check and interrupt clear.  If a packet arrived,
-	 * force another interrupt.
-	 */
-	clear_recv_intr(rcd);
-	present = check_packet_present(rcd);
-	if (present)
-		force_recv_intr(rcd);
-
+	__hfi1_rcd_eoi_intr(rcd);
 	return IRQ_HANDLED;
 }
 
@@ -8472,24 +8564,11 @@
 irqreturn_t receive_context_thread(int irq, void *data)
 {
 	struct hfi1_ctxtdata *rcd = data;
-	int present;
 
 	/* receive interrupt is still blocked from the IRQ handler */
 	(void)rcd->do_interrupt(rcd, 1);
 
-	/*
-	 * The packet processor will only return if it detected no more
-	 * packets.  Hold IRQs here so we can safely clear the interrupt and
-	 * recheck for a packet that may have arrived after the previous
-	 * check and the interrupt clear.  If a packet arrived, force another
-	 * interrupt.
-	 */
-	local_irq_disable();
-	clear_recv_intr(rcd);
-	present = check_packet_present(rcd);
-	if (present)
-		force_recv_intr(rcd);
-	local_irq_enable();
+	hfi1_rcd_eoi_intr(rcd);
 
 	return IRQ_HANDLED;
 }
@@ -10060,7 +10139,7 @@
 	 * the first kernel context would have been allocated by now so
 	 * we are guaranteed a valid value.
 	 */
-	return (dd->rcd[0]->rcvhdrqentsize - 2/*PBC/RHF*/ + 1/*ICRC*/) << 2;
+	return (get_hdrqentsize(dd->rcd[0]) - 2/*PBC/RHF*/ + 1/*ICRC*/) << 2;
 }
 
 /*
@@ -10105,7 +10184,7 @@
 		thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50),
 			    sc_mtu_to_threshold(dd->vld[i].sc,
 						dd->vld[i].mtu,
-						dd->rcd[0]->rcvhdrqentsize));
+						get_hdrqentsize(dd->rcd[0])));
 		for (j = 0; j < INIT_SC_PER_VL; j++)
 			sc_set_cr_threshold(
 					pio_select_send_context_vl(dd, j, i),
@@ -11832,7 +11911,7 @@
 	head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD)
 		& RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT;
 
-	if (rcd->rcvhdrtail_kvaddr)
+	if (hfi1_rcvhdrtail_kvaddr(rcd))
 		tail = get_rcvhdrtail(rcd);
 	else
 		tail = read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL);
@@ -11876,6 +11955,84 @@
 	return 0x1;	/* if invalid, go with the minimum size */
 }
 
+/**
+ * encode_rcv_header_entry_size - return chip specific encoding for size
+ * @size: size in dwords
+ *
+ * Convert a receive header entry size to the encoding used in the CSR.
+ *
+ * Return a zero if the given size is invalid, otherwise the encoding.
+ */
+u8 encode_rcv_header_entry_size(u8 size)
+{
+	/* there are only 3 valid receive header entry sizes */
+	if (size == 2)
+		return 1;
+	if (size == 16)
+		return 2;
+	if (size == 32)
+		return 4;
+	return 0; /* invalid */
+}
+
+/**
+ * hfi1_validate_rcvhdrcnt - validate hdrcnt
+ * @dd: the device data
+ * @thecnt: the header count
+ */
+int hfi1_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt)
+{
+	if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
+		dd_dev_err(dd, "Receive header queue count too small\n");
+		return -EINVAL;
+	}
+
+	if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
+		dd_dev_err(dd,
+			   "Receive header queue count cannot be greater than %u\n",
+			   HFI1_MAX_HDRQ_EGRBUF_CNT);
+		return -EINVAL;
+	}
+
+	if (thecnt % HDRQ_INCREMENT) {
+		dd_dev_err(dd, "Receive header queue count %d must be divisible by %lu\n",
+			   thecnt, HDRQ_INCREMENT);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * set_hdrq_regs - set header queue registers for context
+ * @dd: the device data
+ * @ctxt: the context
+ * @entsize: the dword entry size
+ * @hdrcnt: the number of header entries
+ */
+void set_hdrq_regs(struct hfi1_devdata *dd, u8 ctxt, u8 entsize, u16 hdrcnt)
+{
+	u64 reg;
+
+	reg = (((u64)hdrcnt >> HDRQ_SIZE_SHIFT) & RCV_HDR_CNT_CNT_MASK) <<
+	      RCV_HDR_CNT_CNT_SHIFT;
+	write_kctxt_csr(dd, ctxt, RCV_HDR_CNT, reg);
+	reg = ((u64)encode_rcv_header_entry_size(entsize) &
+	       RCV_HDR_ENT_SIZE_ENT_SIZE_MASK) <<
+	      RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
+	write_kctxt_csr(dd, ctxt, RCV_HDR_ENT_SIZE, reg);
+	reg = ((u64)DEFAULT_RCVHDRSIZE & RCV_HDR_SIZE_HDR_SIZE_MASK) <<
+	      RCV_HDR_SIZE_HDR_SIZE_SHIFT;
+	write_kctxt_csr(dd, ctxt, RCV_HDR_SIZE, reg);
+
+	/*
+	 * Program dummy tail address for every receive context
+	 * before enabling any receive context
+	 */
+	write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
+			dd->rcvhdrtail_dummy_dma);
+}
+
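A short sketch of how these three helpers are meant to compose (illustration only, not part of the patch; example_setup_hdrq is a hypothetical name): validate the requested header queue count, confirm the entry size has a CSR encoding, and only then program the per-context registers.

/*
 * Illustration only: validate-then-program flow for a receive header
 * queue, built from the helpers introduced above.
 */
static int example_setup_hdrq(struct hfi1_devdata *dd, u8 ctxt,
			      u8 entsize, u16 hdrcnt)
{
	int ret;

	ret = hfi1_validate_rcvhdrcnt(dd, hdrcnt);
	if (ret)
		return ret;

	/* only 2, 16 and 32 dword entries have a CSR encoding */
	if (!encode_rcv_header_entry_size(entsize))
		return -EINVAL;

	set_hdrq_regs(dd, ctxt, entsize, hdrcnt);
	return 0;
}
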
 void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op,
 		  struct hfi1_ctxtdata *rcd)
 {
@@ -11897,13 +12054,13 @@
 		/* reset the tail and hdr addresses, and sequence count */
 		write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR,
 				rcd->rcvhdrq_dma);
-		if (rcd->rcvhdrtail_kvaddr)
+		if (hfi1_rcvhdrtail_kvaddr(rcd))
 			write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR,
 					rcd->rcvhdrqtailaddr_dma);
-		rcd->seq_cnt = 1;
+		hfi1_set_seq_cnt(rcd, 1);
 
 		/* reset the cached receive header queue head value */
-		rcd->head = 0;
+		hfi1_set_rcd_head(rcd, 0);
 
 		/*
 		 * Zero the receive header queue so we don't get false
@@ -11983,7 +12140,7 @@
 			      IS_RCVAVAIL_START + rcd->ctxt, false);
 		rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK;
 	}
-	if ((op & HFI1_RCVCTRL_TAILUPD_ENB) && rcd->rcvhdrtail_kvaddr)
+	if ((op & HFI1_RCVCTRL_TAILUPD_ENB) && hfi1_rcvhdrtail_kvaddr(rcd))
 		rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK;
 	if (op & HFI1_RCVCTRL_TAILUPD_DIS) {
 		/* See comment on RcvCtxtCtrl.TailUpd above */
@@ -12724,11 +12881,6 @@
 static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate)
 {
 	switch (chip_lstate) {
-	default:
-		dd_dev_err(dd,
-			   "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
-			   chip_lstate);
-		/* fall through */
 	case LSTATE_DOWN:
 		return IB_PORT_DOWN;
 	case LSTATE_INIT:
@@ -12737,6 +12889,11 @@
 		return IB_PORT_ARMED;
 	case LSTATE_ACTIVE:
 		return IB_PORT_ACTIVE;
+	default:
+		dd_dev_err(dd,
+			   "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n",
+			   chip_lstate);
+		return IB_PORT_DOWN;
 	}
 }
 
@@ -12744,10 +12901,6 @@
 {
 	/* look at the HFI meta-states only */
 	switch (chip_pstate & 0xf0) {
-	default:
-		dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
-			   chip_pstate);
-		/* fall through */
 	case PLS_DISABLED:
 		return IB_PORTPHYSSTATE_DISABLED;
 	case PLS_OFFLINE:
@@ -12760,6 +12913,10 @@
 		return IB_PORTPHYSSTATE_LINKUP;
 	case PLS_PHYTEST:
 		return IB_PORTPHYSSTATE_PHY_TEST;
+	default:
+		dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n",
+			   chip_pstate);
+		return IB_PORTPHYSSTATE_DISABLED;
 	}
 }
 
@@ -13237,13 +13394,12 @@
  *                             in array of contexts
  *	freectxts  - number of free user contexts
  *	num_send_contexts - number of PIO send contexts being used
- *	num_vnic_contexts - number of contexts reserved for VNIC
+ *	num_netdev_contexts - number of contexts reserved for netdev
  */
 static int set_up_context_variables(struct hfi1_devdata *dd)
 {
 	unsigned long num_kernel_contexts;
-	u16 num_vnic_contexts = HFI1_NUM_VNIC_CTXT;
-	int total_contexts;
+	u16 num_netdev_contexts;
 	int ret;
 	unsigned ngroups;
 	int rmt_count;
@@ -13280,13 +13436,6 @@
 		num_kernel_contexts = send_contexts - num_vls - 1;
 	}
 
-	/* Accommodate VNIC contexts if possible */
-	if ((num_kernel_contexts + num_vnic_contexts) > rcv_contexts) {
-		dd_dev_err(dd, "No receive contexts available for VNIC\n");
-		num_vnic_contexts = 0;
-	}
-	total_contexts = num_kernel_contexts + num_vnic_contexts;
-
 	/*
 	 * User contexts:
 	 *	- default to 1 user context per real (non-HT) CPU core if
@@ -13299,28 +13448,32 @@
 	/*
 	 * Adjust the counts given a global max.
 	 */
-	if (total_contexts + n_usr_ctxts > rcv_contexts) {
+	if (num_kernel_contexts + n_usr_ctxts > rcv_contexts) {
 		dd_dev_err(dd,
-			   "Reducing # user receive contexts to: %d, from %u\n",
-			   rcv_contexts - total_contexts,
+			   "Reducing # user receive contexts to: %u, from %u\n",
+			   (u32)(rcv_contexts - num_kernel_contexts),
 			   n_usr_ctxts);
 		/* recalculate */
-		n_usr_ctxts = rcv_contexts - total_contexts;
+		n_usr_ctxts = rcv_contexts - num_kernel_contexts;
 	}
 
+	num_netdev_contexts =
+		hfi1_num_netdev_contexts(dd, rcv_contexts -
+					 (num_kernel_contexts + n_usr_ctxts),
+					 &node_affinity.real_cpu_mask);
 	/*
 	 * The RMT entries are currently allocated as shown below:
 	 * 1. QOS (0 to 128 entries);
 	 * 2. FECN (num_kernel_context - 1 + num_user_contexts +
-	 *    num_vnic_contexts);
-	 * 3. VNIC (num_vnic_contexts).
-	 * It should be noted that FECN oversubscribe num_vnic_contexts
-	 * entries of RMT because both VNIC and PSM could allocate any receive
+	 *    num_netdev_contexts);
+	 * 3. netdev (num_netdev_contexts).
+	 * It should be noted that FECN oversubscribes num_netdev_contexts
+	 * entries of RMT because both netdev and PSM could allocate any receive
 	 * context between dd->first_dyn_alloc_text and dd->num_rcv_contexts,
 	 * and PSM FECN must reserve an RMT entry for each possible PSM receive
 	 * context.
 	 */
-	rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_vnic_contexts * 2);
+	rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_netdev_contexts * 2);
 	if (HFI1_CAP_IS_KSET(TID_RDMA))
 		rmt_count += num_kernel_contexts - 1;
 	if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
@@ -13333,21 +13486,20 @@
 		n_usr_ctxts = user_rmt_reduced;
 	}
 
-	total_contexts += n_usr_ctxts;
-
-	/* the first N are kernel contexts, the rest are user/vnic contexts */
-	dd->num_rcv_contexts = total_contexts;
+	/* the first N are kernel contexts, the rest are user/netdev contexts */
+	dd->num_rcv_contexts =
+		num_kernel_contexts + n_usr_ctxts + num_netdev_contexts;
 	dd->n_krcv_queues = num_kernel_contexts;
 	dd->first_dyn_alloc_ctxt = num_kernel_contexts;
-	dd->num_vnic_contexts = num_vnic_contexts;
+	dd->num_netdev_contexts = num_netdev_contexts;
 	dd->num_user_contexts = n_usr_ctxts;
 	dd->freectxts = n_usr_ctxts;
 	dd_dev_info(dd,
-		    "rcv contexts: chip %d, used %d (kernel %d, vnic %u, user %u)\n",
+		    "rcv contexts: chip %d, used %d (kernel %d, netdev %u, user %u)\n",
 		    rcv_contexts,
 		    (int)dd->num_rcv_contexts,
 		    (int)dd->n_krcv_queues,
-		    dd->num_vnic_contexts,
+		    dd->num_netdev_contexts,
 		    dd->num_user_contexts);
 
 	/*
@@ -14026,21 +14178,12 @@
 
 static void init_kdeth_qp(struct hfi1_devdata *dd)
 {
-	/* user changed the KDETH_QP */
-	if (kdeth_qp != 0 && kdeth_qp >= 0xff) {
-		/* out of range or illegal value */
-		dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring");
-		kdeth_qp = 0;
-	}
-	if (kdeth_qp == 0)	/* not set, or failed range check */
-		kdeth_qp = DEFAULT_KDETH_QP;
-
 	write_csr(dd, SEND_BTH_QP,
-		  (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK) <<
+		  (RVT_KDETH_QP_PREFIX & SEND_BTH_QP_KDETH_QP_MASK) <<
 		  SEND_BTH_QP_KDETH_QP_SHIFT);
 
 	write_csr(dd, RCV_BTH_QP,
-		  (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK) <<
+		  (RVT_KDETH_QP_PREFIX & RCV_BTH_QP_KDETH_QP_MASK) <<
 		  RCV_BTH_QP_KDETH_QP_SHIFT);
 }
 
@@ -14156,6 +14299,12 @@
 	}
 }
 
+/* Is a receive side mapping rule programmed at this index? */
+static bool has_rsm_rule(struct hfi1_devdata *dd, u8 rule_index)
+{
+	return read_csr(dd, RCV_RSM_CFG + (8 * rule_index)) != 0;
+}
+
 /*
  * Add a receive side mapping rule.
  */
@@ -14392,77 +14541,138 @@
 	rmt->used += total_cnt;
 }
 
-/* Initialize RSM for VNIC */
-void hfi1_init_vnic_rsm(struct hfi1_devdata *dd)
+static inline bool hfi1_is_rmt_full(int start, int spare)
+{
+	return (start + spare) > NUM_MAP_ENTRIES;
+}
+
+static bool hfi1_netdev_update_rmt(struct hfi1_devdata *dd)
 {
 	u8 i, j;
 	u8 ctx_id = 0;
 	u64 reg;
 	u32 regoff;
-	struct rsm_rule_data rrd;
+	int rmt_start = hfi1_netdev_get_free_rmt_idx(dd);
+	int ctxt_count = hfi1_netdev_ctxt_count(dd);
 
-	if (hfi1_vnic_is_rsm_full(dd, NUM_VNIC_MAP_ENTRIES)) {
-		dd_dev_err(dd, "Vnic RSM disabled, rmt entries used = %d\n",
-			   dd->vnic.rmt_start);
-		return;
+	/* We already have contexts mapped in RMT */
+	if (has_rsm_rule(dd, RSM_INS_VNIC) || has_rsm_rule(dd, RSM_INS_AIP)) {
+		dd_dev_info(dd, "Contexts are already mapped in RMT\n");
+		return true;
 	}
 
-	dev_dbg(&(dd)->pcidev->dev, "Vnic rsm start = %d, end %d\n",
-		dd->vnic.rmt_start,
-		dd->vnic.rmt_start + NUM_VNIC_MAP_ENTRIES);
+	if (hfi1_is_rmt_full(rmt_start, NUM_NETDEV_MAP_ENTRIES)) {
+		dd_dev_err(dd, "Not enough RMT entries used = %d\n",
+			   rmt_start);
+		return false;
+	}
+
+	dev_dbg(&(dd)->pcidev->dev, "RMT start = %d, end %d\n",
+		rmt_start,
+		rmt_start + NUM_NETDEV_MAP_ENTRIES);
 
 	/* Update RSM mapping table, 32 regs, 256 entries - 1 ctx per byte */
-	regoff = RCV_RSM_MAP_TABLE + (dd->vnic.rmt_start / 8) * 8;
+	regoff = RCV_RSM_MAP_TABLE + (rmt_start / 8) * 8;
 	reg = read_csr(dd, regoff);
-	for (i = 0; i < NUM_VNIC_MAP_ENTRIES; i++) {
-		/* Update map register with vnic context */
-		j = (dd->vnic.rmt_start + i) % 8;
+	for (i = 0; i < NUM_NETDEV_MAP_ENTRIES; i++) {
+		/* Update map register with netdev context */
+		j = (rmt_start + i) % 8;
 		reg &= ~(0xffllu << (j * 8));
-		reg |= (u64)dd->vnic.ctxt[ctx_id++]->ctxt << (j * 8);
-		/* Wrap up vnic ctx index */
-		ctx_id %= dd->vnic.num_ctxt;
+		reg |= (u64)hfi1_netdev_get_ctxt(dd, ctx_id++)->ctxt << (j * 8);
+		/* Wrap up netdev ctx index */
+		ctx_id %= ctxt_count;
 		/* Write back map register */
-		if (j == 7 || ((i + 1) == NUM_VNIC_MAP_ENTRIES)) {
+		if (j == 7 || ((i + 1) == NUM_NETDEV_MAP_ENTRIES)) {
 			dev_dbg(&(dd)->pcidev->dev,
-				"Vnic rsm map reg[%d] =0x%llx\n",
+				"RMT[%d] =0x%llx\n",
 				regoff - RCV_RSM_MAP_TABLE, reg);
 
 			write_csr(dd, regoff, reg);
 			regoff += 8;
-			if (i < (NUM_VNIC_MAP_ENTRIES - 1))
+			if (i < (NUM_NETDEV_MAP_ENTRIES - 1))
 				reg = read_csr(dd, regoff);
 		}
 	}
 
-	/* Add rule for vnic */
-	rrd.offset = dd->vnic.rmt_start;
-	rrd.pkt_type = 4;
-	/* Match 16B packets */
-	rrd.field1_off = L2_TYPE_MATCH_OFFSET;
-	rrd.mask1 = L2_TYPE_MASK;
-	rrd.value1 = L2_16B_VALUE;
-	/* Match ETH L4 packets */
-	rrd.field2_off = L4_TYPE_MATCH_OFFSET;
-	rrd.mask2 = L4_16B_TYPE_MASK;
-	rrd.value2 = L4_16B_ETH_VALUE;
-	/* Calc context from veswid and entropy */
-	rrd.index1_off = L4_16B_HDR_VESWID_OFFSET;
-	rrd.index1_width = ilog2(NUM_VNIC_MAP_ENTRIES);
-	rrd.index2_off = L2_16B_ENTROPY_OFFSET;
-	rrd.index2_width = ilog2(NUM_VNIC_MAP_ENTRIES);
-	add_rsm_rule(dd, RSM_INS_VNIC, &rrd);
+	return true;
+}
 
-	/* Enable RSM if not already enabled */
+static void hfi1_enable_rsm_rule(struct hfi1_devdata *dd,
+				 int rule, struct rsm_rule_data *rrd)
+{
+	if (!hfi1_netdev_update_rmt(dd)) {
+		dd_dev_err(dd, "Failed to update RMT for RSM%d rule\n", rule);
+		return;
+	}
+
+	add_rsm_rule(dd, rule, rrd);
 	add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
 }
 
+void hfi1_init_aip_rsm(struct hfi1_devdata *dd)
+{
+	/*
+	 * Go through with the initialisation only if this rule doesn't
+	 * already exist.
+	 */
+	if (atomic_fetch_inc(&dd->ipoib_rsm_usr_num) == 0) {
+		int rmt_start = hfi1_netdev_get_free_rmt_idx(dd);
+		struct rsm_rule_data rrd = {
+			.offset = rmt_start,
+			.pkt_type = IB_PACKET_TYPE,
+			.field1_off = LRH_BTH_MATCH_OFFSET,
+			.mask1 = LRH_BTH_MASK,
+			.value1 = LRH_BTH_VALUE,
+			.field2_off = BTH_DESTQP_MATCH_OFFSET,
+			.mask2 = BTH_DESTQP_MASK,
+			.value2 = BTH_DESTQP_VALUE,
+			.index1_off = DETH_AIP_SQPN_SELECT_OFFSET +
+					ilog2(NUM_NETDEV_MAP_ENTRIES),
+			.index1_width = ilog2(NUM_NETDEV_MAP_ENTRIES),
+			.index2_off = DETH_AIP_SQPN_SELECT_OFFSET,
+			.index2_width = ilog2(NUM_NETDEV_MAP_ENTRIES)
+		};
+
+		hfi1_enable_rsm_rule(dd, RSM_INS_AIP, &rrd);
+	}
+}
+
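Usage sketch (assumption, for illustration only; example_aip_users is a hypothetical name): the AIP rule is reference counted, so repeated init/deinit calls from multiple netdev users only touch the hardware on the first and last transitions.

/*
 * Illustration only: atomic_fetch_inc() returns the previous value, so
 * only the 0 -> 1 transition programs RSM_INS_AIP, and the matching
 * 1 -> 0 transition in hfi1_deinit_aip_rsm() clears it again.
 */
static void example_aip_users(struct hfi1_devdata *dd)
{
	hfi1_init_aip_rsm(dd);		/* 0 -> 1: rule installed */
	hfi1_init_aip_rsm(dd);		/* 1 -> 2: no hardware change */
	hfi1_deinit_aip_rsm(dd);	/* 2 -> 1: no hardware change */
	hfi1_deinit_aip_rsm(dd);	/* 1 -> 0: rule cleared */
}
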
+/* Initialize RSM for VNIC */
+void hfi1_init_vnic_rsm(struct hfi1_devdata *dd)
+{
+	int rmt_start = hfi1_netdev_get_free_rmt_idx(dd);
+	struct rsm_rule_data rrd = {
+		/* Add rule for vnic */
+		.offset = rmt_start,
+		.pkt_type = 4,
+		/* Match 16B packets */
+		.field1_off = L2_TYPE_MATCH_OFFSET,
+		.mask1 = L2_TYPE_MASK,
+		.value1 = L2_16B_VALUE,
+		/* Match ETH L4 packets */
+		.field2_off = L4_TYPE_MATCH_OFFSET,
+		.mask2 = L4_16B_TYPE_MASK,
+		.value2 = L4_16B_ETH_VALUE,
+		/* Calc context from veswid and entropy */
+		.index1_off = L4_16B_HDR_VESWID_OFFSET,
+		.index1_width = ilog2(NUM_NETDEV_MAP_ENTRIES),
+		.index2_off = L2_16B_ENTROPY_OFFSET,
+		.index2_width = ilog2(NUM_NETDEV_MAP_ENTRIES)
+	};
+
+	hfi1_enable_rsm_rule(dd, RSM_INS_VNIC, &rrd);
+}
+
 void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd)
 {
 	clear_rsm_rule(dd, RSM_INS_VNIC);
+}
 
-	/* Disable RSM if used only by vnic */
-	if (dd->vnic.rmt_start == 0)
-		clear_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK);
+void hfi1_deinit_aip_rsm(struct hfi1_devdata *dd)
+{
+	/* only actually clear the rule if it's the last user asking to do so */
+	if (atomic_fetch_add_unless(&dd->ipoib_rsm_usr_num, -1, 0) == 1)
+		clear_rsm_rule(dd, RSM_INS_AIP);
 }
 
 static int init_rxe(struct hfi1_devdata *dd)
@@ -14481,8 +14691,8 @@
 	init_qos(dd, rmt);
 	init_fecn_handling(dd, rmt);
 	complete_rsm_map_table(dd, rmt);
-	/* record number of used rsm map entries for vnic */
-	dd->vnic.rmt_start = rmt->used;
+	/* record number of used rsm map entries for netdev */
+	hfi1_netdev_set_free_rmt_idx(dd, rmt->used);
 	kfree(rmt);
 
 	/*
@@ -15036,6 +15246,11 @@
 		 (dd->revision >> CCE_REVISION_SW_SHIFT)
 		    & CCE_REVISION_SW_MASK);
 
+	/* alloc netdev data */
+	ret = hfi1_netdev_alloc(dd);
+	if (ret)
+		goto bail_cleanup;
+
 	ret = set_up_context_variables(dd);
 	if (ret)
 		goto bail_cleanup;
@@ -15136,6 +15351,7 @@
 	hfi1_comp_vectors_clean_up(dd);
 	msix_clean_up_interrupts(dd);
 bail_cleanup:
+	hfi1_netdev_free(dd);
 	hfi1_pcie_ddcleanup(dd);
 bail_free:
 	hfi1_free_devdata(dd);
diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h
index af00619..2c6f2de 100644
--- a/drivers/infiniband/hw/hfi1/chip.h
+++ b/drivers/infiniband/hw/hfi1/chip.h
@@ -1,7 +1,7 @@
 #ifndef _CHIP_H
 #define _CHIP_H
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -358,6 +358,8 @@
 #define MAX_EAGER_BUFFER       (256 * 1024)
 #define MAX_EAGER_BUFFER_TOTAL (64 * (1 << 20)) /* max per ctxt 64MB */
 #define MAX_EXPECTED_BUFFER    (2048 * 1024)
+#define HFI1_MIN_HDRQ_EGRBUF_CNT 32
+#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352
 
 /*
  * Receive expected base and count and eager base and count increment -
@@ -699,6 +701,10 @@
 	return read_csr(dd, RCV_ARRAY_CNT);
 }
 
+u8 encode_rcv_header_entry_size(u8 size);
+int hfi1_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt);
+void set_hdrq_regs(struct hfi1_devdata *dd, u8 ctxt, u8 entsize, u16 hdrcnt);
+
 u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl,
 	       u32 dw_len);
 
@@ -1441,6 +1447,7 @@
 irqreturn_t sdma_interrupt(int irq, void *data);
 irqreturn_t receive_context_interrupt(int irq, void *data);
 irqreturn_t receive_context_thread(int irq, void *data);
+irqreturn_t receive_context_interrupt_napi(int irq, void *data);
 
 int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set);
 void init_qsfp_int(struct hfi1_devdata *dd);
@@ -1449,6 +1456,8 @@
 void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr);
 void reset_interrupts(struct hfi1_devdata *dd);
 u8 hfi1_get_qp_map(struct hfi1_devdata *dd, u8 idx);
+void hfi1_init_aip_rsm(struct hfi1_devdata *dd);
+void hfi1_deinit_aip_rsm(struct hfi1_devdata *dd);
 
 /*
  * Interrupt source table.
diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h
index d47da7b..ff423e5 100644
--- a/drivers/infiniband/hw/hfi1/common.h
+++ b/drivers/infiniband/hw/hfi1/common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -72,13 +72,6 @@
  * compilation unit
  */
 
-/*
- * If a packet's QP[23:16] bits match this value, then it is
- * a PSM packet and the hardware will expect a KDETH header
- * following the BTH.
- */
-#define DEFAULT_KDETH_QP 0x80
-
 /* driver/hw feature set bitmask */
 #define HFI1_CAP_USER_SHIFT      24
 #define HFI1_CAP_MASK            ((1UL << HFI1_CAP_USER_SHIFT) - 1)
@@ -149,7 +142,8 @@
 				   HFI1_CAP_NO_INTEGRITY |		\
 				   HFI1_CAP_PKEY_CHECK |		\
 				   HFI1_CAP_TID_RDMA |			\
-				   HFI1_CAP_OPFN) <<			\
+				   HFI1_CAP_OPFN |			\
+				   HFI1_CAP_AIP) <<			\
 				  HFI1_CAP_USER_SHIFT)
 /*
  * Set of capabilities that need to be enabled for kernel context in
@@ -166,6 +160,7 @@
 				 HFI1_CAP_PKEY_CHECK |			\
 				 HFI1_CAP_MULTI_PKT_EGR |		\
 				 HFI1_CAP_EXTENDED_PSN |		\
+				 HFI1_CAP_AIP |				\
 				 ((HFI1_CAP_HDRSUPP |			\
 				   HFI1_CAP_MULTI_PKT_EGR |		\
 				   HFI1_CAP_STATIC_RATE_CTRL |		\
@@ -323,6 +318,9 @@
 /* RHF receive type error - bypass packet errors */
 #define RHF_RTE_BYPASS_NO_ERR		0x0
 
+/* MAX RcvSEQ */
+#define RHF_MAX_SEQ 13
+
 /* IB - LRH header constants */
 #define HFI1_LRH_GRH 0x0003      /* 1. word of IB LRH - next header: GRH */
 #define HFI1_LRH_BTH 0x0002      /* 1. word of IB LRH - next header: BTH */
diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c
index c29da2f..2ced236 100644
--- a/drivers/infiniband/hw/hfi1/debugfs.c
+++ b/drivers/infiniband/hw/hfi1/debugfs.c
@@ -379,7 +379,7 @@
 	struct hfi1_devdata *dd = dd_from_dev(ibd);
 
 	++*pos;
-	if (!dd->rcd || *pos >= dd->n_krcv_queues)
+	if (!dd->rcd || *pos >= dd->num_rcv_contexts)
 		return NULL;
 	return pos;
 }
diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c
index 941b465..8085718 100644
--- a/drivers/infiniband/hw/hfi1/driver.c
+++ b/drivers/infiniband/hw/hfi1/driver.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015-2018 Intel Corporation.
+ * Copyright(c) 2015-2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -54,6 +54,7 @@
 #include <linux/module.h>
 #include <linux/prefetch.h>
 #include <rdma/ib_verbs.h>
+#include <linux/etherdevice.h>
 
 #include "hfi.h"
 #include "trace.h"
@@ -63,6 +64,9 @@
 #include "vnic.h"
 #include "fault.h"
 
+#include "ipoib.h"
+#include "netdev.h"
+
 #undef pr_fmt
 #define pr_fmt(fmt) DRIVER_NAME ": " fmt
 
@@ -411,14 +415,14 @@
 static inline void init_packet(struct hfi1_ctxtdata *rcd,
 			       struct hfi1_packet *packet)
 {
-	packet->rsize = rcd->rcvhdrqentsize; /* words */
-	packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize; /* words */
+	packet->rsize = get_hdrqentsize(rcd); /* words */
+	packet->maxcnt = get_hdrq_cnt(rcd) * packet->rsize; /* words */
 	packet->rcd = rcd;
 	packet->updegr = 0;
 	packet->etail = -1;
 	packet->rhf_addr = get_rhf_addr(rcd);
 	packet->rhf = rhf_to_cpu(packet->rhf_addr);
-	packet->rhqoff = rcd->head;
+	packet->rhqoff = hfi1_rcd_head(rcd);
 	packet->numpkt = 0;
 }
 
@@ -551,22 +555,22 @@
 	mdata->maxcnt = packet->maxcnt;
 	mdata->ps_head = packet->rhqoff;
 
-	if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
+	if (get_dma_rtail_setting(rcd)) {
 		mdata->ps_tail = get_rcvhdrtail(rcd);
 		if (rcd->ctxt == HFI1_CTRL_CTXT)
-			mdata->ps_seq = rcd->seq_cnt;
+			mdata->ps_seq = hfi1_seq_cnt(rcd);
 		else
 			mdata->ps_seq = 0; /* not used with DMA_RTAIL */
 	} else {
 		mdata->ps_tail = 0; /* used only with DMA_RTAIL*/
-		mdata->ps_seq = rcd->seq_cnt;
+		mdata->ps_seq = hfi1_seq_cnt(rcd);
 	}
 }
 
 static inline int ps_done(struct ps_mdata *mdata, u64 rhf,
 			  struct hfi1_ctxtdata *rcd)
 {
-	if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL))
+	if (get_dma_rtail_setting(rcd))
 		return mdata->ps_head == mdata->ps_tail;
 	return mdata->ps_seq != rhf_rcv_seq(rhf);
 }
@@ -592,11 +596,9 @@
 		mdata->ps_head = 0;
 
 	/* Control context must do seq counting */
-	if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ||
-	    (rcd->ctxt == HFI1_CTRL_CTXT)) {
-		if (++mdata->ps_seq > 13)
-			mdata->ps_seq = 1;
-	}
+	if (!get_dma_rtail_setting(rcd) ||
+	    rcd->ctxt == HFI1_CTRL_CTXT)
+		mdata->ps_seq = hfi1_seq_incr_wrap(mdata->ps_seq);
 }
 
 /*
@@ -750,6 +752,39 @@
 	return ret;
 }
 
+static void process_rcv_packet_napi(struct hfi1_packet *packet)
+{
+	packet->etype = rhf_rcv_type(packet->rhf);
+
+	/* total length */
+	packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */
+	/* retrieve eager buffer details */
+	packet->etail = rhf_egr_index(packet->rhf);
+	packet->ebuf = get_egrbuf(packet->rcd, packet->rhf,
+				  &packet->updegr);
+	/*
+	 * Prefetch the contents of the eager buffer.  It is
+	 * OK to send a negative length to prefetch_range().
+	 * The +2 is the size of the RHF.
+	 */
+	prefetch_range(packet->ebuf,
+		       packet->tlen - ((packet->rcd->rcvhdrqentsize -
+				       (rhf_hdrq_offset(packet->rhf)
+					+ 2)) * 4));
+
+	packet->rcd->rhf_rcv_function_map[packet->etype](packet);
+	packet->numpkt++;
+
+	/* Set up for the next packet */
+	packet->rhqoff += packet->rsize;
+	if (packet->rhqoff >= packet->maxcnt)
+		packet->rhqoff = 0;
+
+	packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff +
+				      packet->rcd->rhf_offset;
+	packet->rhf = rhf_to_cpu(packet->rhf_addr);
+}
+
 static inline int process_rcv_packet(struct hfi1_packet *packet, int thread)
 {
 	int ret;
@@ -770,7 +805,7 @@
 		 * The +2 is the size of the RHF.
 		 */
 		prefetch_range(packet->ebuf,
-			       packet->tlen - ((packet->rcd->rcvhdrqentsize -
+			       packet->tlen - ((get_hdrqentsize(packet->rcd) -
 					       (rhf_hdrq_offset(packet->rhf)
 						+ 2)) * 4));
 	}
@@ -824,22 +859,50 @@
 	 * The only thing we need to do is a final update and call for an
 	 * interrupt
 	 */
-	update_usrhead(packet->rcd, packet->rcd->head, packet->updegr,
+	update_usrhead(packet->rcd, hfi1_rcd_head(packet->rcd), packet->updegr,
 		       packet->etail, rcv_intr_dynamic, packet->numpkt);
 }
 
 /*
+ * handle_receive_interrupt_napi_fp - receive a packet
+ * @rcd: the context
+ * @budget: polling budget
+ *
+ * Called from interrupt handler for receive interrupt.
+ * This is the fast path interrupt handler used
+ * when executing in the napi softirq environment.
+ */
+int handle_receive_interrupt_napi_fp(struct hfi1_ctxtdata *rcd, int budget)
+{
+	struct hfi1_packet packet;
+
+	init_packet(rcd, &packet);
+	if (last_rcv_seq(rcd, rhf_rcv_seq(packet.rhf)))
+		goto bail;
+
+	while (packet.numpkt < budget) {
+		process_rcv_packet_napi(&packet);
+		if (hfi1_seq_incr(rcd, rhf_rcv_seq(packet.rhf)))
+			break;
+
+		process_rcv_update(0, &packet);
+	}
+	hfi1_set_rcd_head(rcd, packet.rhqoff);
+bail:
+	finish_packet(&packet);
+	return packet.numpkt;
+}
+
+/*
  * Handle receive interrupts when using the no dma rtail option.
  */
 int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread)
 {
-	u32 seq;
 	int last = RCV_PKT_OK;
 	struct hfi1_packet packet;
 
 	init_packet(rcd, &packet);
-	seq = rhf_rcv_seq(packet.rhf);
-	if (seq != rcd->seq_cnt) {
+	if (last_rcv_seq(rcd, rhf_rcv_seq(packet.rhf))) {
 		last = RCV_PKT_DONE;
 		goto bail;
 	}
@@ -848,15 +911,12 @@
 
 	while (last == RCV_PKT_OK) {
 		last = process_rcv_packet(&packet, thread);
-		seq = rhf_rcv_seq(packet.rhf);
-		if (++rcd->seq_cnt > 13)
-			rcd->seq_cnt = 1;
-		if (seq != rcd->seq_cnt)
+		if (hfi1_seq_incr(rcd, rhf_rcv_seq(packet.rhf)))
 			last = RCV_PKT_DONE;
 		process_rcv_update(last, &packet);
 	}
 	process_rcv_qp_work(&packet);
-	rcd->head = packet.rhqoff;
+	hfi1_set_rcd_head(rcd, packet.rhqoff);
 bail:
 	finish_packet(&packet);
 	return last;
@@ -885,15 +945,14 @@
 		process_rcv_update(last, &packet);
 	}
 	process_rcv_qp_work(&packet);
-	rcd->head = packet.rhqoff;
+	hfi1_set_rcd_head(rcd, packet.rhqoff);
 bail:
 	finish_packet(&packet);
 	return last;
 }
 
-static inline void set_nodma_rtail(struct hfi1_devdata *dd, u16 ctxt)
+static void set_all_fastpath(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
 {
-	struct hfi1_ctxtdata *rcd;
 	u16 i;
 
 	/*
@@ -901,50 +960,17 @@
 	 * interrupt handler only for that context. Otherwise, switch
 	 * interrupt handler for all statically allocated kernel contexts.
 	 */
-	if (ctxt >= dd->first_dyn_alloc_ctxt) {
-		rcd = hfi1_rcd_get_by_index_safe(dd, ctxt);
-		if (rcd) {
-			rcd->do_interrupt =
-				&handle_receive_interrupt_nodma_rtail;
-			hfi1_rcd_put(rcd);
-		}
-		return;
-	}
-
-	for (i = HFI1_CTRL_CTXT + 1; i < dd->first_dyn_alloc_ctxt; i++) {
-		rcd = hfi1_rcd_get_by_index(dd, i);
-		if (rcd)
-			rcd->do_interrupt =
-				&handle_receive_interrupt_nodma_rtail;
+	if (rcd->ctxt >= dd->first_dyn_alloc_ctxt && !rcd->is_vnic) {
+		hfi1_rcd_get(rcd);
+		hfi1_set_fast(rcd);
 		hfi1_rcd_put(rcd);
-	}
-}
-
-static inline void set_dma_rtail(struct hfi1_devdata *dd, u16 ctxt)
-{
-	struct hfi1_ctxtdata *rcd;
-	u16 i;
-
-	/*
-	 * For dynamically allocated kernel contexts (like vnic) switch
-	 * interrupt handler only for that context. Otherwise, switch
-	 * interrupt handler for all statically allocated kernel contexts.
-	 */
-	if (ctxt >= dd->first_dyn_alloc_ctxt) {
-		rcd = hfi1_rcd_get_by_index_safe(dd, ctxt);
-		if (rcd) {
-			rcd->do_interrupt =
-				&handle_receive_interrupt_dma_rtail;
-			hfi1_rcd_put(rcd);
-		}
 		return;
 	}
 
-	for (i = HFI1_CTRL_CTXT + 1; i < dd->first_dyn_alloc_ctxt; i++) {
+	for (i = HFI1_CTRL_CTXT + 1; i < dd->num_rcv_contexts; i++) {
 		rcd = hfi1_rcd_get_by_index(dd, i);
-		if (rcd)
-			rcd->do_interrupt =
-				&handle_receive_interrupt_dma_rtail;
+		if (rcd && (i < dd->first_dyn_alloc_ctxt || rcd->is_vnic))
+			hfi1_set_fast(rcd);
 		hfi1_rcd_put(rcd);
 	}
 }
@@ -960,17 +986,14 @@
 		if (!rcd)
 			continue;
 		if (i < dd->first_dyn_alloc_ctxt || rcd->is_vnic)
-			rcd->do_interrupt = &handle_receive_interrupt;
+			rcd->do_interrupt = rcd->slow_handler;
 
 		hfi1_rcd_put(rcd);
 	}
 }
 
-static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd,
-				      struct hfi1_packet *packet,
-				      struct hfi1_devdata *dd)
+static bool __set_armed_to_active(struct hfi1_packet *packet)
 {
-	struct work_struct *lsaw = &rcd->ppd->linkstate_active_work;
 	u8 etype = rhf_rcv_type(packet->rhf);
 	u8 sc = SC15_PACKET;
 
@@ -985,19 +1008,34 @@
 		sc = hfi1_16B_get_sc(hdr);
 	}
 	if (sc != SC15_PACKET) {
-		int hwstate = driver_lstate(rcd->ppd);
+		int hwstate = driver_lstate(packet->rcd->ppd);
+		struct work_struct *lsaw =
+				&packet->rcd->ppd->linkstate_active_work;
 
 		if (hwstate != IB_PORT_ACTIVE) {
-			dd_dev_info(dd,
+			dd_dev_info(packet->rcd->dd,
 				    "Unexpected link state %s\n",
 				    opa_lstate_name(hwstate));
-			return 0;
+			return false;
 		}
 
-		queue_work(rcd->ppd->link_wq, lsaw);
-		return 1;
+		queue_work(packet->rcd->ppd->link_wq, lsaw);
+		return true;
 	}
-	return 0;
+	return false;
+}
+
+/**
+ * set_armed_to_active - the fast path for armed to active
+ * @packet: the packet structure
+ *
+ * Return true if packet processing needs to bail.
+ */
+static bool set_armed_to_active(struct hfi1_packet *packet)
+{
+	if (likely(packet->rcd->ppd->host_link_state != HLS_UP_ARMED))
+		return false;
+	return __set_armed_to_active(packet);
 }
 
 /*
@@ -1015,15 +1053,15 @@
 	struct hfi1_packet packet;
 	int skip_pkt = 0;
 
+	if (!rcd->rcvhdrq)
+		return RCV_PKT_OK;
 	/* Control context will always use the slow path interrupt handler */
 	needset = (rcd->ctxt == HFI1_CTRL_CTXT) ? 0 : 1;
 
 	init_packet(rcd, &packet);
 
-	if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
-		u32 seq = rhf_rcv_seq(packet.rhf);
-
-		if (seq != rcd->seq_cnt) {
+	if (!get_dma_rtail_setting(rcd)) {
+		if (last_rcv_seq(rcd, rhf_rcv_seq(packet.rhf))) {
 			last = RCV_PKT_DONE;
 			goto bail;
 		}
@@ -1040,22 +1078,15 @@
 		 * Control context can potentially receive an invalid
 		 * rhf. Drop such packets.
 		 */
-		if (rcd->ctxt == HFI1_CTRL_CTXT) {
-			u32 seq = rhf_rcv_seq(packet.rhf);
-
-			if (seq != rcd->seq_cnt)
+		if (rcd->ctxt == HFI1_CTRL_CTXT)
+			if (last_rcv_seq(rcd, rhf_rcv_seq(packet.rhf)))
 				skip_pkt = 1;
-		}
 	}
 
 	prescan_rxq(rcd, &packet);
 
 	while (last == RCV_PKT_OK) {
-		if (unlikely(dd->do_drop &&
-			     atomic_xchg(&dd->drop_packet, DROP_PACKET_OFF) ==
-			     DROP_PACKET_ON)) {
-			dd->do_drop = 0;
-
+		if (hfi1_need_drop(dd)) {
 			/* On to the next packet */
 			packet.rhqoff += packet.rsize;
 			packet.rhf_addr = (__le32 *)rcd->rcvhdrq +
@@ -1067,26 +1098,14 @@
 			last = skip_rcv_packet(&packet, thread);
 			skip_pkt = 0;
 		} else {
-			/* Auto activate link on non-SC15 packet receive */
-			if (unlikely(rcd->ppd->host_link_state ==
-				     HLS_UP_ARMED) &&
-			    set_armed_to_active(rcd, &packet, dd))
+			if (set_armed_to_active(&packet))
 				goto bail;
 			last = process_rcv_packet(&packet, thread);
 		}
 
-		if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) {
-			u32 seq = rhf_rcv_seq(packet.rhf);
-
-			if (++rcd->seq_cnt > 13)
-				rcd->seq_cnt = 1;
-			if (seq != rcd->seq_cnt)
+		if (!get_dma_rtail_setting(rcd)) {
+			if (hfi1_seq_incr(rcd, rhf_rcv_seq(packet.rhf)))
 				last = RCV_PKT_DONE;
-			if (needset) {
-				dd_dev_info(dd, "Switching to NO_DMA_RTAIL\n");
-				set_nodma_rtail(dd, rcd->ctxt);
-				needset = 0;
-			}
 		} else {
 			if (packet.rhqoff == hdrqtail)
 				last = RCV_PKT_DONE;
@@ -1095,27 +1114,24 @@
 			 * rhf. Drop such packets.
 			 */
 			if (rcd->ctxt == HFI1_CTRL_CTXT) {
-				u32 seq = rhf_rcv_seq(packet.rhf);
+				bool lseq;
 
-				if (++rcd->seq_cnt > 13)
-					rcd->seq_cnt = 1;
-				if (!last && (seq != rcd->seq_cnt))
+				lseq = hfi1_seq_incr(rcd,
+						     rhf_rcv_seq(packet.rhf));
+				if (!last && lseq)
 					skip_pkt = 1;
 			}
-
-			if (needset) {
-				dd_dev_info(dd,
-					    "Switching to DMA_RTAIL\n");
-				set_dma_rtail(dd, rcd->ctxt);
-				needset = 0;
-			}
 		}
 
+		if (needset) {
+			needset = false;
+			set_all_fastpath(dd, rcd);
+		}
 		process_rcv_update(last, &packet);
 	}
 
 	process_rcv_qp_work(&packet);
-	rcd->head = packet.rhqoff;
+	hfi1_set_rcd_head(rcd, packet.rhqoff);
 
 bail:
 	/*
@@ -1127,6 +1143,63 @@
 }
 
 /*
+ * handle_receive_interrupt_napi_sp - receive a packet
+ * @rcd: the context
+ * @budget: polling budget
+ *
+ * Called from interrupt handler for errors or receive interrupt.
+ * This is the slow path interrupt handler used
+ * when executing in the napi softirq environment.
+ */
+int handle_receive_interrupt_napi_sp(struct hfi1_ctxtdata *rcd, int budget)
+{
+	struct hfi1_devdata *dd = rcd->dd;
+	int last = RCV_PKT_OK;
+	bool needset = true;
+	struct hfi1_packet packet;
+
+	init_packet(rcd, &packet);
+	if (last_rcv_seq(rcd, rhf_rcv_seq(packet.rhf)))
+		goto bail;
+
+	while (last != RCV_PKT_DONE && packet.numpkt < budget) {
+		if (hfi1_need_drop(dd)) {
+			/* On to the next packet */
+			packet.rhqoff += packet.rsize;
+			packet.rhf_addr = (__le32 *)rcd->rcvhdrq +
+					  packet.rhqoff +
+					  rcd->rhf_offset;
+			packet.rhf = rhf_to_cpu(packet.rhf_addr);
+
+		} else {
+			if (set_armed_to_active(&packet))
+				goto bail;
+			process_rcv_packet_napi(&packet);
+		}
+
+		if (hfi1_seq_incr(rcd, rhf_rcv_seq(packet.rhf)))
+			last = RCV_PKT_DONE;
+
+		if (needset) {
+			needset = false;
+			set_all_fastpath(dd, rcd);
+		}
+
+		process_rcv_update(last, &packet);
+	}
+
+	hfi1_set_rcd_head(rcd, packet.rhqoff);
+
+bail:
+	/*
+	 * Always write head at end, and setup rcv interrupt, even
+	 * if no packets were processed.
+	 */
+	finish_packet(&packet);
+	return packet.numpkt;
+}
+
+/*
  * We may discover in the interrupt that the hardware link state has
  * changed from ARMED to ACTIVE (due to the arrival of a non-SC15 packet),
  * and we need to update the driver's notion of the link state.  We cannot
@@ -1603,59 +1676,116 @@
 		show_eflags_errs(packet);
 }
 
+static void hfi1_ipoib_ib_rcv(struct hfi1_packet *packet)
+{
+	struct hfi1_ibport *ibp;
+	struct net_device *netdev;
+	struct hfi1_ctxtdata *rcd = packet->rcd;
+	struct napi_struct *napi = rcd->napi;
+	struct sk_buff *skb;
+	struct hfi1_netdev_rxq *rxq = container_of(napi,
+			struct hfi1_netdev_rxq, napi);
+	u32 extra_bytes;
+	u32 tlen, qpnum;
+	bool do_work, do_cnp;
+	struct hfi1_ipoib_dev_priv *priv;
+
+	trace_hfi1_rcvhdr(packet);
+
+	hfi1_setup_ib_header(packet);
+
+	packet->ohdr = &((struct ib_header *)packet->hdr)->u.oth;
+	packet->grh = NULL;
+
+	if (unlikely(rhf_err_flags(packet->rhf))) {
+		handle_eflags(packet);
+		return;
+	}
+
+	qpnum = ib_bth_get_qpn(packet->ohdr);
+	netdev = hfi1_netdev_get_data(rcd->dd, qpnum);
+	if (!netdev)
+		goto drop_no_nd;
+
+	trace_input_ibhdr(rcd->dd, packet, !!(rhf_dc_info(packet->rhf)));
+	trace_ctxt_rsm_hist(rcd->ctxt);
+
+	/* handle congestion notifications */
+	do_work = hfi1_may_ecn(packet);
+	if (unlikely(do_work)) {
+		do_cnp = (packet->opcode != IB_OPCODE_CNP);
+		(void)hfi1_process_ecn_slowpath(hfi1_ipoib_priv(netdev)->qp,
+						 packet, do_cnp);
+	}
+
+	/*
+	 * The receive split point is after the last byte of the
+	 * DETH, so strip the padding, CRC and ICRC. tlen is the
+	 * whole packet length, so the header size must be
+	 * subtracted as well.
+	 */
+	tlen = packet->tlen;
+	extra_bytes = ib_bth_get_pad(packet->ohdr) + (SIZE_OF_CRC << 2) +
+			packet->hlen;
+	if (unlikely(tlen < extra_bytes))
+		goto drop;
+
+	tlen -= extra_bytes;
+
+	skb = hfi1_ipoib_prepare_skb(rxq, tlen, packet->ebuf);
+	if (unlikely(!skb))
+		goto drop;
+
+	priv = hfi1_ipoib_priv(netdev);
+	hfi1_ipoib_update_rx_netstats(priv, 1, skb->len);
+
+	skb->dev = netdev;
+	skb->pkt_type = PACKET_HOST;
+	netif_receive_skb(skb);
+
+	return;
+
+drop:
+	++netdev->stats.rx_dropped;
+drop_no_nd:
+	ibp = rcd_to_iport(packet->rcd);
+	++ibp->rvp.n_pkt_drops;
+}
+
 /*
  * The following functions are called by the interrupt handler. They are type
  * specific handlers for each packet type.
  */
-static int process_receive_ib(struct hfi1_packet *packet)
+static void process_receive_ib(struct hfi1_packet *packet)
 {
 	if (hfi1_setup_9B_packet(packet))
-		return RHF_RCV_CONTINUE;
+		return;
 
 	if (unlikely(hfi1_dbg_should_fault_rx(packet)))
-		return RHF_RCV_CONTINUE;
+		return;
 
 	trace_hfi1_rcvhdr(packet);
 
 	if (unlikely(rhf_err_flags(packet->rhf))) {
 		handle_eflags(packet);
-		return RHF_RCV_CONTINUE;
+		return;
 	}
 
 	hfi1_ib_rcv(packet);
-	return RHF_RCV_CONTINUE;
 }
 
-static inline bool hfi1_is_vnic_packet(struct hfi1_packet *packet)
-{
-	/* Packet received in VNIC context via RSM */
-	if (packet->rcd->is_vnic)
-		return true;
-
-	if ((hfi1_16B_get_l2(packet->ebuf) == OPA_16B_L2_TYPE) &&
-	    (hfi1_16B_get_l4(packet->ebuf) == OPA_16B_L4_ETHR))
-		return true;
-
-	return false;
-}
-
-static int process_receive_bypass(struct hfi1_packet *packet)
+static void process_receive_bypass(struct hfi1_packet *packet)
 {
 	struct hfi1_devdata *dd = packet->rcd->dd;
 
-	if (hfi1_is_vnic_packet(packet)) {
-		hfi1_vnic_bypass_rcv(packet);
-		return RHF_RCV_CONTINUE;
-	}
-
 	if (hfi1_setup_bypass_packet(packet))
-		return RHF_RCV_CONTINUE;
+		return;
 
 	trace_hfi1_rcvhdr(packet);
 
 	if (unlikely(rhf_err_flags(packet->rhf))) {
 		handle_eflags(packet);
-		return RHF_RCV_CONTINUE;
+		return;
 	}
 
 	if (hfi1_16B_get_l2(packet->hdr) == 0x2) {
@@ -1678,17 +1808,16 @@
 				(OPA_EI_STATUS_SMASK | BAD_L2_ERR);
 		}
 	}
-	return RHF_RCV_CONTINUE;
 }
 
-static int process_receive_error(struct hfi1_packet *packet)
+static void process_receive_error(struct hfi1_packet *packet)
 {
 	/* KHdrHCRCErr -- KDETH packet with a bad HCRC */
 	if (unlikely(
 		 hfi1_dbg_fault_suppress_err(&packet->rcd->dd->verbs_dev) &&
 		 (rhf_rcv_type_err(packet->rhf) == RHF_RCV_TYPE_ERROR ||
 		  packet->rhf & RHF_DC_ERR)))
-		return RHF_RCV_CONTINUE;
+		return;
 
 	hfi1_setup_ib_header(packet);
 	handle_eflags(packet);
@@ -1696,32 +1825,29 @@
 	if (unlikely(rhf_err_flags(packet->rhf)))
 		dd_dev_err(packet->rcd->dd,
 			   "Unhandled error packet received. Dropping.\n");
-
-	return RHF_RCV_CONTINUE;
 }
 
-static int kdeth_process_expected(struct hfi1_packet *packet)
+static void kdeth_process_expected(struct hfi1_packet *packet)
 {
 	hfi1_setup_9B_packet(packet);
 	if (unlikely(hfi1_dbg_should_fault_rx(packet)))
-		return RHF_RCV_CONTINUE;
+		return;
 
 	if (unlikely(rhf_err_flags(packet->rhf))) {
 		struct hfi1_ctxtdata *rcd = packet->rcd;
 
 		if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
-			return RHF_RCV_CONTINUE;
+			return;
 	}
 
 	hfi1_kdeth_expected_rcv(packet);
-	return RHF_RCV_CONTINUE;
 }
 
-static int kdeth_process_eager(struct hfi1_packet *packet)
+static void kdeth_process_eager(struct hfi1_packet *packet)
 {
 	hfi1_setup_9B_packet(packet);
 	if (unlikely(hfi1_dbg_should_fault_rx(packet)))
-		return RHF_RCV_CONTINUE;
+		return;
 
 	trace_hfi1_rcvhdr(packet);
 	if (unlikely(rhf_err_flags(packet->rhf))) {
@@ -1729,37 +1855,41 @@
 
 		show_eflags_errs(packet);
 		if (hfi1_handle_kdeth_eflags(rcd, rcd->ppd, packet))
-			return RHF_RCV_CONTINUE;
+			return;
 	}
 
 	hfi1_kdeth_eager_rcv(packet);
-	return RHF_RCV_CONTINUE;
 }
 
-static int process_receive_invalid(struct hfi1_packet *packet)
+static void process_receive_invalid(struct hfi1_packet *packet)
 {
 	dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n",
 		   rhf_rcv_type(packet->rhf));
-	return RHF_RCV_CONTINUE;
 }
 
+#define HFI1_RCVHDR_DUMP_MAX	5
+
 void seqfile_dump_rcd(struct seq_file *s, struct hfi1_ctxtdata *rcd)
 {
 	struct hfi1_packet packet;
 	struct ps_mdata mdata;
+	int i;
 
-	seq_printf(s, "Rcd %u: RcvHdr cnt %u entsize %u %s head %llu tail %llu\n",
-		   rcd->ctxt, rcd->rcvhdrq_cnt, rcd->rcvhdrqentsize,
-		   HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) ?
+	seq_printf(s, "Rcd %u: RcvHdr cnt %u entsize %u %s ctrl 0x%08llx status 0x%08llx, head %llu tail %llu  sw head %u\n",
+		   rcd->ctxt, get_hdrq_cnt(rcd), get_hdrqentsize(rcd),
+		   get_dma_rtail_setting(rcd) ?
 		   "dma_rtail" : "nodma_rtail",
+		   read_kctxt_csr(rcd->dd, rcd->ctxt, RCV_CTXT_CTRL),
+		   read_kctxt_csr(rcd->dd, rcd->ctxt, RCV_CTXT_STATUS),
 		   read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD) &
 		   RCV_HDR_HEAD_HEAD_MASK,
-		   read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL));
+		   read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL),
+		   rcd->head);
 
 	init_packet(rcd, &packet);
 	init_ps_mdata(&mdata, &packet);
 
-	while (1) {
+	for (i = 0; i < HFI1_RCVHDR_DUMP_MAX; i++) {
 		__le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head +
 					 rcd->rhf_offset;
 		struct ib_header *hdr;
@@ -1811,3 +1941,14 @@
 	[RHF_RCV_TYPE_INVALID6] = process_receive_invalid,
 	[RHF_RCV_TYPE_INVALID7] = process_receive_invalid,
 };
+
+const rhf_rcv_function_ptr netdev_rhf_rcv_functions[] = {
+	[RHF_RCV_TYPE_EXPECTED] = process_receive_invalid,
+	[RHF_RCV_TYPE_EAGER] = process_receive_invalid,
+	[RHF_RCV_TYPE_IB] = hfi1_ipoib_ib_rcv,
+	[RHF_RCV_TYPE_ERROR] = process_receive_error,
+	[RHF_RCV_TYPE_BYPASS] = hfi1_vnic_bypass_rcv,
+	[RHF_RCV_TYPE_INVALID5] = process_receive_invalid,
+	[RHF_RCV_TYPE_INVALID6] = process_receive_invalid,
+	[RHF_RCV_TYPE_INVALID7] = process_receive_invalid,
+};
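
The driver.c hunks above fold the open-coded receive sequence handling (the repeated "if (++rcd->seq_cnt > 13) rcd->seq_cnt = 1" blocks) into the hfi1_seq_* helpers and add the NAPI fast/slow path handlers built around process_rcv_packet_napi(). A minimal userspace sketch of the wrap behaviour only, with illustrative names that are not part of the patch:

    #include <stdio.h>

    /* mirrors RHF_MAX_SEQ (13) introduced in chip.h above */
    #define RHF_MAX_SEQ 13

    /* same wrap rule as hfi1_seq_incr_wrap(): 1..13, never 0 */
    static unsigned char seq_incr_wrap(unsigned char seq)
    {
        if (++seq > RHF_MAX_SEQ)
            seq = 1;
        return seq;
    }

    int main(void)
    {
        unsigned char seq = 1;
        int i;

        /* prints 2 3 ... 12 13 1 2 3: wraps from 13 back to 1 */
        for (i = 0; i < 15; i++) {
            seq = seq_incr_wrap(seq);
            printf("%u ", seq);
        }
        printf("\n");
        return 0;
    }

Centralizing the constant keeps the control context and the non-DMA_RTAIL contexts on the same counting rule without duplicating the magic value 13.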
diff --git a/drivers/infiniband/hw/hfi1/efivar.c b/drivers/infiniband/hw/hfi1/efivar.c
index d106d23..c22ab7b 100644
--- a/drivers/infiniband/hw/hfi1/efivar.c
+++ b/drivers/infiniband/hw/hfi1/efivar.c
@@ -78,7 +78,7 @@
 	*size = 0;
 	*return_data = NULL;
 
-	if (!efi_enabled(EFI_RUNTIME_SERVICES))
+	if (!efi_rt_services_supported(EFI_RT_SUPPORTED_GET_VARIABLE))
 		return -EOPNOTSUPP;
 
 	uni_name = kcalloc(strlen(name) + 1, sizeof(efi_char16_t), GFP_KERNEL);
diff --git a/drivers/infiniband/hw/hfi1/fault.c b/drivers/infiniband/hw/hfi1/fault.c
index 986c121..0dfbcfb 100644
--- a/drivers/infiniband/hw/hfi1/fault.c
+++ b/drivers/infiniband/hw/hfi1/fault.c
@@ -222,11 +222,11 @@
 	while (bit < bitsize) {
 		zero = find_next_zero_bit(fault->opcodes, bitsize, bit);
 		if (zero - 1 != bit)
-			size += snprintf(data + size,
+			size += scnprintf(data + size,
 					 datalen - size - 1,
 					 "0x%lx-0x%lx,", bit, zero - 1);
 		else
-			size += snprintf(data + size,
+			size += scnprintf(data + size,
 					 datalen - size - 1, "0x%lx,",
 					 bit);
 		bit = find_next_bit(fault->opcodes, bitsize, zero);
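
The snprintf() to scnprintf() switch above matters because the return values are accumulated into size: snprintf() reports the length it would have written, so a truncated write can push size past datalen, while scnprintf() reports only what was actually stored. A hedged userspace stand-in for the kernel helper (names below are illustrative):

    #include <stdarg.h>
    #include <stdio.h>

    /*
     * Userspace approximation of the kernel's scnprintf(): the return
     * value is capped at what actually fits, so "size += ..." cannot
     * walk past the end of the buffer.
     */
    static int scnprintf_like(char *buf, size_t size, const char *fmt, ...)
    {
        va_list args;
        int n;

        if (size == 0)
            return 0;

        va_start(args, fmt);
        n = vsnprintf(buf, size, fmt, args);
        va_end(args);

        if (n < 0)
            return 0;
        return (size_t)n < size ? n : (int)(size - 1);
    }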
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index 89e1dfd..329ee4f 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -1,5 +1,6 @@
 /*
- * Copyright(c) 2015-2017 Intel Corporation.
+ * Copyright(c) 2020 Cornelis Networks, Inc.
+ * Copyright(c) 2015-2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -206,10 +207,7 @@
 	spin_lock_init(&fd->tid_lock);
 	spin_lock_init(&fd->invalid_lock);
 	fd->rec_cpu_num = -1; /* no cpu affinity by default */
-	fd->mm = current->mm;
-	mmgrab(fd->mm);
 	fd->dd = dd;
-	kobject_get(&fd->dd->kobj);
 	fp->private_data = fd;
 	return 0;
 nomem:
@@ -516,12 +514,12 @@
 			ret = -EINVAL;
 			goto done;
 		}
-		if ((flags & VM_WRITE) || !uctxt->rcvhdrtail_kvaddr) {
+		if ((flags & VM_WRITE) || !hfi1_rcvhdrtail_kvaddr(uctxt)) {
 			ret = -EPERM;
 			goto done;
 		}
 		memlen = PAGE_SIZE;
-		memvirt = (void *)uctxt->rcvhdrtail_kvaddr;
+		memvirt = (void *)hfi1_rcvhdrtail_kvaddr(uctxt);
 		flags &= ~VM_MAYWRITE;
 		break;
 	case SUBCTXT_UREGS:
@@ -712,8 +710,6 @@
 
 	deallocate_ctxt(uctxt);
 done:
-	mmdrop(fdata->mm);
-	kobject_put(&dd->kobj);
 
 	if (atomic_dec_and_test(&dd->user_refcount))
 		complete(&dd->user_comp);
@@ -1102,7 +1098,7 @@
 	 * don't have to wait to be sure the DMA update has happened
 	 * (chip resets head/tail to 0 on transition to enable).
 	 */
-	if (uctxt->rcvhdrtail_kvaddr)
+	if (hfi1_rcvhdrtail_kvaddr(uctxt))
 		clear_rcvhdrtail(uctxt);
 
 	/* Setup J_KEY before enabling the context */
@@ -1150,7 +1146,7 @@
 			HFI1_CAP_UGET_MASK(uctxt->flags, MASK) |
 			HFI1_CAP_KGET_MASK(uctxt->flags, K2U);
 	/* adjust flag if this fd is not able to cache */
-	if (!fd->handler)
+	if (!fd->use_mn)
 		cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */
 
 	cinfo.num_active = hfi1_count_active_units();
@@ -1166,8 +1162,8 @@
 	cinfo.send_ctxt = uctxt->sc->hw_context;
 
 	cinfo.egrtids = uctxt->egrbufs.alloced;
-	cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
-	cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2;
+	cinfo.rcvhdrq_cnt = get_hdrq_cnt(uctxt);
+	cinfo.rcvhdrq_entsize = get_hdrqentsize(uctxt) << 2;
 	cinfo.sdma_ring_size = fd->cq->nentries;
 	cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
 
@@ -1266,7 +1262,7 @@
 	memset(&binfo, 0, sizeof(binfo));
 	binfo.hw_version = dd->revision;
 	binfo.sw_version = HFI1_KERN_SWVERSION;
-	binfo.bthqp = kdeth_qp;
+	binfo.bthqp = RVT_KDETH_QP_PREFIX;
 	binfo.jkey = uctxt->jkey;
 	/*
 	 * If more than 64 contexts are enabled the allocated credit
@@ -1555,7 +1551,7 @@
 		 * always resets it's tail register back to 0 on a
 		 * transition from disabled to enabled.
 		 */
-		if (uctxt->rcvhdrtail_kvaddr)
+		if (hfi1_rcvhdrtail_kvaddr(uctxt))
 			clear_rcvhdrtail(uctxt);
 		rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
 	} else {
@@ -1696,7 +1692,7 @@
 	snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
 	ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops,
 			     &dd->user_cdev, &dd->user_device,
-			     true, &dd->kobj);
+			     true, &dd->verbs_dev.rdi.ibdev.dev.kobj);
 	if (ret)
 		user_remove(dd);
 
diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c
index c090807..2cf102b 100644
--- a/drivers/infiniband/hw/hfi1/firmware.c
+++ b/drivers/infiniband/hw/hfi1/firmware.c
@@ -1868,11 +1868,8 @@
 									2;
 				break;
 			case PLATFORM_CONFIG_RX_PRESET_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_TX_PRESET_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
 				pcfgcache->config_tables[table_type].num_table =
 							table_length_dwords;
@@ -1890,15 +1887,10 @@
 			/* metadata table */
 			switch (table_type) {
 			case PLATFORM_CONFIG_SYSTEM_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_PORT_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_RX_PRESET_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_TX_PRESET_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-				/* fall through */
 			case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
 				break;
 			default:
@@ -2028,15 +2020,10 @@
 
 	switch (table) {
 	case PLATFORM_CONFIG_SYSTEM_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_PORT_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_RX_PRESET_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_TX_PRESET_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
 		if (field && field < platform_config_table_limits[table])
 			src_ptr =
@@ -2139,11 +2126,8 @@
 			pcfgcache->config_tables[table_type].table;
 		break;
 	case PLATFORM_CONFIG_RX_PRESET_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_TX_PRESET_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_QSFP_ATTEN_TABLE:
-		/* fall through */
 	case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE:
 		src_ptr = pcfgcache->config_tables[table_type].table;
 
diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
index b79931c..2a9a040 100644
--- a/drivers/infiniband/hw/hfi1/hfi.h
+++ b/drivers/infiniband/hw/hfi1/hfi.h
@@ -1,7 +1,8 @@
 #ifndef _HFI1_KERNEL_H
 #define _HFI1_KERNEL_H
 /*
- * Copyright(c) 2015-2018 Intel Corporation.
+ * Copyright(c) 2020 Cornelis Networks, Inc.
+ * Copyright(c) 2015-2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -197,7 +198,9 @@
 	u32 count;
 };
 
-typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
+struct hfi1_ctxtdata;
+typedef int (*intr_handler)(struct hfi1_ctxtdata *rcd, int data);
+typedef void (*rhf_rcv_function_ptr)(struct hfi1_packet *packet);
 
 struct tid_queue {
 	struct list_head queue_head;
@@ -226,7 +229,13 @@
 	 * be valid. Worst case is we process an extra interrupt and up to 64
 	 * packets with the wrong interrupt handler.
 	 */
-	int (*do_interrupt)(struct hfi1_ctxtdata *rcd, int threaded);
+	intr_handler do_interrupt;
+	/** fast handler after autoactive */
+	intr_handler fast_handler;
+	/** slow handler */
+	intr_handler slow_handler;
+	/* napi pointer associated with netdev */
+	struct napi_struct *napi;
 	/* verbs rx_stats per rcd */
 	struct hfi1_opcode_stats_perctx *opstats;
 	/* clear interrupt mask */
@@ -377,11 +386,11 @@
 	u32 rhqoff;
 	u32 dlid;
 	u32 slid;
+	int numpkt;
 	u16 tlen;
 	s16 etail;
 	u16 pkey;
 	u8 hlen;
-	u8 numpkt;
 	u8 rsize;
 	u8 updegr;
 	u8 etype;
@@ -979,7 +988,7 @@
 			      struct hfi1_pkt_state *ps,
 			      struct rvt_swqe *wqe);
 extern const rhf_rcv_function_ptr normal_rhf_rcv_functions[];
-
+extern const rhf_rcv_function_ptr netdev_rhf_rcv_functions[];
 
 /* return values for the RHF receive functions */
 #define RHF_RCV_CONTINUE  0	/* keep going */
@@ -1039,23 +1048,10 @@
 #define NUM_MAP_ENTRIES	 256
 #define NUM_MAP_REGS      32
 
-/*
- * Number of VNIC contexts used. Ensure it is less than or equal to
- * max queues supported by VNIC (HFI1_VNIC_MAX_QUEUE).
- */
-#define HFI1_NUM_VNIC_CTXT   8
-
-/* Number of VNIC RSM entries */
-#define NUM_VNIC_MAP_ENTRIES 8
-
 /* Virtual NIC information */
 struct hfi1_vnic_data {
-	struct hfi1_ctxtdata *ctxt[HFI1_NUM_VNIC_CTXT];
 	struct kmem_cache *txreq_cache;
-	struct xarray vesws;
 	u8 num_vports;
-	u8 rmt_start;
-	u8 num_ctxt;
 };
 
 struct hfi1_vnic_vport_info;
@@ -1161,8 +1157,8 @@
 	u64 z_send_schedule;
 
 	u64 __percpu *send_schedule;
-	/* number of reserved contexts for VNIC usage */
-	u16 num_vnic_contexts;
+	/* number of reserved contexts for netdev usage */
+	u16 num_netdev_contexts;
 	/* number of receive contexts in use by the driver */
 	u32 num_rcv_contexts;
 	/* number of pio send contexts in use by the driver */
@@ -1312,7 +1308,7 @@
 	struct err_info_constraint err_info_xmit_constraint;
 
 	atomic_t drop_packet;
-	u8 do_drop;
+	bool do_drop;
 	u8 err_info_uncorrectable;
 	u8 err_info_fmconfig;
 
@@ -1407,18 +1403,17 @@
 	bool aspm_enabled;	/* ASPM state: enabled/disabled */
 	struct rhashtable *sdma_rht;
 
-	struct kobject kobj;
-
 	/* vnic data */
 	struct hfi1_vnic_data vnic;
 	/* Lock to protect IRQ SRC register access */
 	spinlock_t irq_src_lock;
-};
+	int vnic_num_vports;
+	struct net_device *dummy_netdev;
+	struct hfi1_affinity_node *affinity_entry;
 
-static inline bool hfi1_vnic_is_rsm_full(struct hfi1_devdata *dd, int spare)
-{
-	return (dd->vnic.rmt_start + spare) > NUM_MAP_ENTRIES;
-}
+	/* Keeps track of IPoIB RSM rule users */
+	atomic_t ipoib_rsm_usr_num;
+};
 
 /* 8051 firmware version helper */
 #define dc8051_ver(a, b, c) ((a) << 16 | (b) << 8 | (c))
@@ -1449,7 +1444,7 @@
 	/* for cpu affinity; -1 if none */
 	int rec_cpu_num;
 	u32 tid_n_pinned;
-	struct mmu_rb_handler *handler;
+	bool use_mn;
 	struct tid_rb_node **entry_to_rb;
 	spinlock_t tid_lock; /* protect tid_[limit,used] counters */
 	u32 tid_limit;
@@ -1458,7 +1453,6 @@
 	u32 invalid_tid_idx;
 	/* protect invalid_tids array and invalid_tid_idx */
 	spinlock_t invalid_lock;
-	struct mm_struct *mm;
 };
 
 extern struct xarray hfi1_dev_table;
@@ -1496,6 +1490,8 @@
 int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread);
 int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread);
 int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread);
+int handle_receive_interrupt_napi_fp(struct hfi1_ctxtdata *rcd, int budget);
+int handle_receive_interrupt_napi_sp(struct hfi1_ctxtdata *rcd, int budget);
 void set_all_slowpath(struct hfi1_devdata *dd);
 
 extern const struct pci_device_id hfi1_pci_tbl[];
@@ -1512,12 +1508,148 @@
 #define RCV_PKT_LIMIT   0x1 /* stop, hit limit, start thread */
 #define RCV_PKT_DONE    0x2 /* stop, no more packets detected */
 
+/**
+ * hfi1_rcd_head - return the rcd head
+ * @rcd: the context
+ */
+static inline u32 hfi1_rcd_head(struct hfi1_ctxtdata *rcd)
+{
+	return rcd->head;
+}
+
+/**
+ * hfi1_set_rcd_head - set the rcd head
+ * @rcd: the context
+ * @head: the new head
+ */
+static inline void hfi1_set_rcd_head(struct hfi1_ctxtdata *rcd, u32 head)
+{
+	rcd->head = head;
+}
+
 /* calculate the current RHF address */
 static inline __le32 *get_rhf_addr(struct hfi1_ctxtdata *rcd)
 {
 	return (__le32 *)rcd->rcvhdrq + rcd->head + rcd->rhf_offset;
 }
 
+/* return DMA_RTAIL configuration */
+static inline bool get_dma_rtail_setting(struct hfi1_ctxtdata *rcd)
+{
+	return !!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL);
+}
+
+/**
+ * hfi1_seq_incr_wrap - wrapping increment for sequence
+ * @seq: the current sequence number
+ *
+ * Returns: the incremented seq
+ */
+static inline u8 hfi1_seq_incr_wrap(u8 seq)
+{
+	if (++seq > RHF_MAX_SEQ)
+		seq = 1;
+	return seq;
+}
+
+/**
+ * hfi1_seq_cnt - return seq_cnt member
+ * @rcd: the receive context
+ *
+ * Return seq_cnt member
+ */
+static inline u8 hfi1_seq_cnt(struct hfi1_ctxtdata *rcd)
+{
+	return rcd->seq_cnt;
+}
+
+/**
+ * hfi1_set_seq_cnt - set seq_cnt member
+ * @rcd: the receive context
+ *
+ * Set the seq_cnt member to @cnt
+ */
+static inline void hfi1_set_seq_cnt(struct hfi1_ctxtdata *rcd, u8 cnt)
+{
+	rcd->seq_cnt = cnt;
+}
+
+/**
+ * last_rcv_seq - detect the last packet
+ * @rcd: the receive context
+ * @seq: the sequence number from the current RHF
+ *
+ * Return true if @seq does not match the expected sequence (no more packets)
+ */
+static inline bool last_rcv_seq(struct hfi1_ctxtdata *rcd, u32 seq)
+{
+	return seq != rcd->seq_cnt;
+}
+
+/**
+ * hfi1_seq_incr - increment context sequence number
+ * @rcd: the receive context
+ * @seq: the current sequence number
+ *
+ * Returns: true if this was the last packet
+ */
+static inline bool hfi1_seq_incr(struct hfi1_ctxtdata *rcd, u32 seq)
+{
+	rcd->seq_cnt = hfi1_seq_incr_wrap(rcd->seq_cnt);
+	return last_rcv_seq(rcd, seq);
+}
+
+/**
+ * get_hdrqentsize - return hdrq entry size
+ * @rcd: the receive context
+ */
+static inline u8 get_hdrqentsize(struct hfi1_ctxtdata *rcd)
+{
+	return rcd->rcvhdrqentsize;
+}
+
+/**
+ * get_hdrq_cnt - return hdrq count
+ * @rcd: the receive context
+ */
+static inline u16 get_hdrq_cnt(struct hfi1_ctxtdata *rcd)
+{
+	return rcd->rcvhdrq_cnt;
+}
+
+/**
+ * hfi1_is_slowpath - check if this context is slow path
+ * @rcd: the receive context
+ */
+static inline bool hfi1_is_slowpath(struct hfi1_ctxtdata *rcd)
+{
+	return rcd->do_interrupt == rcd->slow_handler;
+}
+
+/**
+ * hfi1_is_fastpath - check if this context is fast path
+ * @rcd: the receive context
+ */
+static inline bool hfi1_is_fastpath(struct hfi1_ctxtdata *rcd)
+{
+	if (rcd->ctxt == HFI1_CTRL_CTXT)
+		return false;
+
+	return rcd->do_interrupt == rcd->fast_handler;
+}
+
+/**
+ * hfi1_set_fast - change to the fast handler
+ * @rcd: the receive context
+ */
+static inline void hfi1_set_fast(struct hfi1_ctxtdata *rcd)
+{
+	if (unlikely(!rcd))
+		return;
+	if (unlikely(!hfi1_is_fastpath(rcd)))
+		rcd->do_interrupt = rcd->fast_handler;
+}
+
 int hfi1_reset_device(int);
 
 void receive_interrupt_work(struct work_struct *work);
@@ -2020,9 +2152,21 @@
 void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
 			     size_t npages, bool dirty);
 
+/**
+ * hfi1_rcvhdrtail_kvaddr - return tail kvaddr
+ * @rcd: the receive context
+ */
+static inline __le64 *hfi1_rcvhdrtail_kvaddr(const struct hfi1_ctxtdata *rcd)
+{
+	return (__le64 *)rcd->rcvhdrtail_kvaddr;
+}
+
 static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
 {
-	*((u64 *)rcd->rcvhdrtail_kvaddr) = 0ULL;
+	u64 *kv = (u64 *)hfi1_rcvhdrtail_kvaddr(rcd);
+
+	if (kv)
+		*kv = 0ULL;
 }
 
 static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd)
@@ -2031,7 +2175,17 @@
 	 * volatile because it's a DMA target from the chip, routine is
 	 * inlined, and don't want register caching or reordering.
 	 */
-	return (u32)le64_to_cpu(*rcd->rcvhdrtail_kvaddr);
+	return (u32)le64_to_cpu(*hfi1_rcvhdrtail_kvaddr(rcd));
+}
+
+static inline bool hfi1_packet_present(struct hfi1_ctxtdata *rcd)
+{
+	if (likely(!rcd->rcvhdrtail_kvaddr)) {
+		u32 seq = rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd)));
+
+		return !last_rcv_seq(rcd, seq);
+	}
+	return hfi1_rcd_head(rcd) != get_rcvhdrtail(rcd);
 }
 
 /*
@@ -2088,7 +2242,6 @@
 extern unsigned long n_krcvqs;
 extern uint krcvqs[];
 extern int krcvqsset;
-extern uint kdeth_qp;
 extern uint loopback;
 extern uint quick_linkup;
 extern uint rcv_intr_timeout;
@@ -2303,6 +2456,25 @@
 	return dd->pcidev->device == PCI_DEVICE_ID_INTEL1;
 }
 
+/**
+ * hfi1_need_drop - detect need for drop
+ * @dd: the device
+ *
+ * In some cases, the first packet needs to be dropped.
+ *
+ * Return true if the current packet needs to be dropped and false otherwise.
+ */
+static inline bool hfi1_need_drop(struct hfi1_devdata *dd)
+{
+	if (unlikely(dd->do_drop &&
+		     atomic_xchg(&dd->drop_packet, DROP_PACKET_OFF) ==
+		     DROP_PACKET_ON)) {
+		dd->do_drop = false;
+		return true;
+	}
+	return false;
+}
+
 int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp);
 
 #define DD_DEV_ENTRY(dd)       __string(dev, dev_name(&(dd)->pcidev->dev))
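
Among the new hfi.h helpers, hfi1_need_drop() is the one with subtle concurrency: the one-shot drop flag is consumed with an atomic exchange so that, even if several receive contexts race on the same device, only one of them observes DROP_PACKET_ON and drops a packet. A rough standalone sketch of that pattern in C11 atomics (illustrative names, not kernel code):

    #include <stdatomic.h>
    #include <stdbool.h>

    enum { DROP_PACKET_OFF, DROP_PACKET_ON };

    struct fake_dev {
        bool do_drop;
        atomic_int drop_packet;
    };

    /* same shape as hfi1_need_drop(): the exchange arms at most one caller */
    static bool need_drop(struct fake_dev *dd)
    {
        if (dd->do_drop &&
            atomic_exchange(&dd->drop_packet, DROP_PACKET_OFF) ==
            DROP_PACKET_ON) {
            dd->do_drop = false;
            return true;
        }
        return false;
    }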
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 1256dbd..fa2cd76 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -69,6 +69,7 @@
 #include "affinity.h"
 #include "vnic.h"
 #include "exp_rcv.h"
+#include "netdev.h"
 
 #undef pr_fmt
 #define pr_fmt(fmt) DRIVER_NAME ": " fmt
@@ -78,8 +79,6 @@
  */
 #define HFI1_MIN_USER_CTXT_BUFCNT 7
 
-#define HFI1_MIN_HDRQ_EGRBUF_CNT 2
-#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352
 #define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */
 #define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */
 
@@ -122,8 +121,6 @@
 module_param(user_credit_return_threshold, uint, S_IRUGO);
 MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)");
 
-static inline u64 encode_rcv_header_entry_size(u16 size);
-
 DEFINE_XARRAY_FLAGS(hfi1_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
 
 static int hfi1_create_kctxt(struct hfi1_devdata *dd,
@@ -154,7 +151,11 @@
 	/* Control context must use DMA_RTAIL */
 	if (rcd->ctxt == HFI1_CTRL_CTXT)
 		rcd->flags |= HFI1_CAP_DMA_RTAIL;
-	rcd->seq_cnt = 1;
+	rcd->fast_handler = get_dma_rtail_setting(rcd) ?
+				handle_receive_interrupt_dma_rtail :
+				handle_receive_interrupt_nodma_rtail;
+
+	hfi1_set_seq_cnt(rcd, 1);
 
 	rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node);
 	if (!rcd->sc) {
@@ -373,6 +374,9 @@
 		rcd->numa_id = numa;
 		rcd->rcv_array_groups = dd->rcv_entries.ngroups;
 		rcd->rhf_rcv_function_map = normal_rhf_rcv_functions;
+		rcd->slow_handler = handle_receive_interrupt;
+		rcd->do_interrupt = rcd->slow_handler;
+		rcd->msix_intr = CCE_NUM_MSIX_VECTORS;
 
 		mutex_init(&rcd->exp_mutex);
 		spin_lock_init(&rcd->exp_lock);
@@ -511,23 +515,6 @@
 }
 
 /*
- * Convert a receive header entry size that to the encoding used in the CSR.
- *
- * Return a zero if the given size is invalid.
- */
-static inline u64 encode_rcv_header_entry_size(u16 size)
-{
-	/* there are only 3 valid receive header entry sizes */
-	if (size == 2)
-		return 1;
-	if (size == 16)
-		return 2;
-	else if (size == 32)
-		return 4;
-	return 0; /* invalid */
-}
-
-/*
  * Select the largest ccti value over all SLs to determine the intra-
  * packet gap for the link.
  *
@@ -910,10 +897,10 @@
 
 	if (is_ax(dd)) {
 		atomic_set(&dd->drop_packet, DROP_PACKET_ON);
-		dd->do_drop = 1;
+		dd->do_drop = true;
 	} else {
 		atomic_set(&dd->drop_packet, DROP_PACKET_OFF);
-		dd->do_drop = 0;
+		dd->do_drop = false;
 	}
 
 	/* make sure the link is not "up" */
@@ -929,18 +916,6 @@
 	if (ret)
 		goto done;
 
-	/* allocate dummy tail memory for all receive contexts */
-	dd->rcvhdrtail_dummy_kvaddr = dma_alloc_coherent(&dd->pcidev->dev,
-							 sizeof(u64),
-							 &dd->rcvhdrtail_dummy_dma,
-							 GFP_KERNEL);
-
-	if (!dd->rcvhdrtail_dummy_kvaddr) {
-		dd_dev_err(dd, "cannot allocate dummy tail memory\n");
-		ret = -ENOMEM;
-		goto done;
-	}
-
 	/* dd->rcd can be NULL if early initialization failed */
 	for (i = 0; dd->rcd && i < dd->first_dyn_alloc_ctxt; ++i) {
 		/*
@@ -953,8 +928,6 @@
 		if (!rcd)
 			continue;
 
-		rcd->do_interrupt = &handle_receive_interrupt;
-
 		lastfail = hfi1_create_rcvhdrq(dd, rcd);
 		if (!lastfail)
 			lastfail = hfi1_setup_eagerbufs(rcd);
@@ -1162,9 +1135,9 @@
 		dma_free_coherent(&dd->pcidev->dev, rcvhdrq_size(rcd),
 				  rcd->rcvhdrq, rcd->rcvhdrq_dma);
 		rcd->rcvhdrq = NULL;
-		if (rcd->rcvhdrtail_kvaddr) {
+		if (hfi1_rcvhdrtail_kvaddr(rcd)) {
 			dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
-					  (void *)rcd->rcvhdrtail_kvaddr,
+					  (void *)hfi1_rcvhdrtail_kvaddr(rcd),
 					  rcd->rcvhdrqtailaddr_dma);
 			rcd->rcvhdrtail_kvaddr = NULL;
 		}
@@ -1175,7 +1148,7 @@
 	rcd->egrbufs.rcvtids = NULL;
 
 	for (e = 0; e < rcd->egrbufs.alloced; e++) {
-		if (rcd->egrbufs.buffers[e].dma)
+		if (rcd->egrbufs.buffers[e].addr)
 			dma_free_coherent(&dd->pcidev->dev,
 					  rcd->egrbufs.buffers[e].len,
 					  rcd->egrbufs.buffers[e].addr,
@@ -1227,13 +1200,13 @@
 }
 
 /**
- * hfi1_clean_devdata - cleans up per-unit data structure
+ * hfi1_free_devdata - cleans up and frees per-unit data structure
  * @dd: pointer to a valid devdata structure
  *
- * It cleans up all data structures set up by
+ * It cleans up and frees all data structures set up
  * by hfi1_alloc_devdata().
  */
-static void hfi1_clean_devdata(struct hfi1_devdata *dd)
+void hfi1_free_devdata(struct hfi1_devdata *dd)
 {
 	struct hfi1_asic_data *ad;
 	unsigned long flags;
@@ -1256,27 +1229,15 @@
 	dd->tx_opstats    = NULL;
 	kfree(dd->comp_vect);
 	dd->comp_vect = NULL;
+	if (dd->rcvhdrtail_dummy_kvaddr)
+		dma_free_coherent(&dd->pcidev->dev, sizeof(u64),
+				  (void *)dd->rcvhdrtail_dummy_kvaddr,
+				  dd->rcvhdrtail_dummy_dma);
+	dd->rcvhdrtail_dummy_kvaddr = NULL;
 	sdma_clean(dd, dd->num_sdma);
 	rvt_dealloc_device(&dd->verbs_dev.rdi);
 }
 
-static void __hfi1_free_devdata(struct kobject *kobj)
-{
-	struct hfi1_devdata *dd =
-		container_of(kobj, struct hfi1_devdata, kobj);
-
-	hfi1_clean_devdata(dd);
-}
-
-static struct kobj_type hfi1_devdata_type = {
-	.release = __hfi1_free_devdata,
-};
-
-void hfi1_free_devdata(struct hfi1_devdata *dd)
-{
-	kobject_put(&dd->kobj);
-}
-
 /**
  * hfi1_alloc_devdata - Allocate our primary per-unit data structure.
  * @pdev: Valid PCI device
@@ -1303,7 +1264,6 @@
 	dd->pport = (struct hfi1_pportdata *)(dd + 1);
 	dd->pcidev = pdev;
 	pci_set_drvdata(pdev, dd);
-	dd->node = NUMA_NO_NODE;
 
 	ret = xa_alloc_irq(&hfi1_dev_table, &dd->unit, dd, xa_limit_32b,
 			GFP_KERNEL);
@@ -1313,6 +1273,15 @@
 		goto bail;
 	}
 	rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit);
+	/*
+	 * If the BIOS does not have the NUMA node information set, select
+	 * NUMA 0 so we get consistent performance.
+	 */
+	dd->node = pcibus_to_node(pdev->bus);
+	if (dd->node == NUMA_NO_NODE) {
+		dd_dev_err(dd, "Invalid PCI NUMA node. Performance may be affected\n");
+		dd->node = 0;
+	}
 
 	/*
 	 * Initialize all locks for the device. This needs to be as early as
@@ -1362,11 +1331,20 @@
 		goto bail;
 	}
 
-	kobject_init(&dd->kobj, &hfi1_devdata_type);
+	/* allocate dummy tail memory for all receive contexts */
+	dd->rcvhdrtail_dummy_kvaddr =
+		dma_alloc_coherent(&dd->pcidev->dev, sizeof(u64),
+				   &dd->rcvhdrtail_dummy_dma, GFP_KERNEL);
+	if (!dd->rcvhdrtail_dummy_kvaddr) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	atomic_set(&dd->ipoib_rsm_usr_num, 0);
 	return dd;
 
 bail:
-	hfi1_clean_devdata(dd);
+	hfi1_free_devdata(dd);
 	return ERR_PTR(ret);
 }
 
@@ -1569,13 +1547,6 @@
 
 	free_credit_return(dd);
 
-	if (dd->rcvhdrtail_dummy_kvaddr) {
-		dma_free_coherent(&dd->pcidev->dev, sizeof(u64),
-				  (void *)dd->rcvhdrtail_dummy_kvaddr,
-				  dd->rcvhdrtail_dummy_dma);
-		dd->rcvhdrtail_dummy_kvaddr = NULL;
-	}
-
 	/*
 	 * Free any resources still in use (usually just kernel contexts)
 	 * at unload; we do for ctxtcnt, because that's what we allocate.
@@ -1624,29 +1595,6 @@
 	hfi1_free_devdata(dd);
 }
 
-static int init_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt)
-{
-	if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) {
-		dd_dev_err(dd, "Receive header queue count too small\n");
-		return -EINVAL;
-	}
-
-	if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) {
-		dd_dev_err(dd,
-			   "Receive header queue count cannot be greater than %u\n",
-			   HFI1_MAX_HDRQ_EGRBUF_CNT);
-		return -EINVAL;
-	}
-
-	if (thecnt % HDRQ_INCREMENT) {
-		dd_dev_err(dd, "Receive header queue count %d must be divisible by %lu\n",
-			   thecnt, HDRQ_INCREMENT);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
 static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	int ret = 0, j, pidx, initfail;
@@ -1674,7 +1622,7 @@
 	}
 
 	/* Validate some global module parameters */
-	ret = init_validate_rcvhdrcnt(dd, rcvhdrcnt);
+	ret = hfi1_validate_rcvhdrcnt(dd, rcvhdrcnt);
 	if (ret)
 		goto bail;
 
@@ -1733,9 +1681,6 @@
 	/* do the generic initialization */
 	initfail = hfi1_init(dd, 0);
 
-	/* setup vnic */
-	hfi1_vnic_setup(dd);
-
 	ret = hfi1_register_ib_device(dd);
 
 	/*
@@ -1774,7 +1719,6 @@
 			hfi1_device_remove(dd);
 		if (!ret)
 			hfi1_unregister_ib_device(dd);
-		hfi1_vnic_cleanup(dd);
 		postinit_cleanup(dd);
 		if (initfail)
 			ret = initfail;
@@ -1819,8 +1763,8 @@
 	/* unregister from IB core */
 	hfi1_unregister_ib_device(dd);
 
-	/* cleanup vnic */
-	hfi1_vnic_cleanup(dd);
+	/* free netdev data */
+	hfi1_netdev_free(dd);
 
 	/*
 	 * Disable the IB link, disable interrupts on the device,
@@ -1856,7 +1800,6 @@
 int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
 {
 	unsigned amt;
-	u64 reg;
 
 	if (!rcd->rcvhdrq) {
 		gfp_t gfp_flags;
@@ -1888,30 +1831,9 @@
 				goto bail_free;
 		}
 	}
-	/*
-	 * These values are per-context:
-	 *	RcvHdrCnt
-	 *	RcvHdrEntSize
-	 *	RcvHdrSize
-	 */
-	reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT)
-			& RCV_HDR_CNT_CNT_MASK)
-		<< RCV_HDR_CNT_CNT_SHIFT;
-	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg);
-	reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize)
-			& RCV_HDR_ENT_SIZE_ENT_SIZE_MASK)
-		<< RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT;
-	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg);
-	reg = ((u64)DEFAULT_RCVHDRSIZE & RCV_HDR_SIZE_HDR_SIZE_MASK)
-		<< RCV_HDR_SIZE_HDR_SIZE_SHIFT;
-	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg);
 
-	/*
-	 * Program dummy tail address for every receive context
-	 * before enabling any receive context
-	 */
-	write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_TAIL_ADDR,
-			dd->rcvhdrtail_dummy_dma);
+	set_hdrq_regs(rcd->dd, rcd->ctxt, rcd->rcvhdrqentsize,
+		      rcd->rcvhdrq_cnt);
 
 	return 0;
 
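
The per-context RcvHdrCnt/RcvHdrEntSize/RcvHdrSize CSR programming removed from hfi1_create_rcvhdrq() above is consolidated behind set_hdrq_regs(). For reference, the encoding that the deleted encode_rcv_header_entry_size() helper implemented (restated standalone here, not part of the patch) maps the three legal entry sizes to their CSR codes:

    /* 2, 16 and 32 dword header entries encode as 1, 2 and 4 */
    static unsigned long encode_rcv_header_entry_size(unsigned short size)
    {
        if (size == 2)
            return 1;
        if (size == 16)
            return 2;
        if (size == 32)
            return 4;
        return 0;    /* invalid */
    }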
diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h
index 07847cb..d580aa1 100644
--- a/drivers/infiniband/hw/hfi1/iowait.h
+++ b/drivers/infiniband/hw/hfi1/iowait.h
@@ -399,7 +399,7 @@
  * @wait_head: the wait queue
  *
  * This function is called to insert an iowait struct into a
- * wait queue after a resource (eg, sdma decriptor or pio
+ * wait queue after a resource (eg, sdma descriptor or pio
  * buffer) is run out.
  */
 static inline void iowait_queue(bool pkts_sent, struct iowait *w,
diff --git a/drivers/infiniband/hw/hfi1/ipoib.h b/drivers/infiniband/hw/hfi1/ipoib.h
new file mode 100644
index 0000000..1ee361c
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/ipoib.h
@@ -0,0 +1,178 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2020 Intel Corporation.
+ *
+ */
+
+/*
+ * This file contains HFI1 support for IPOIB functionality
+ */
+
+#ifndef HFI1_IPOIB_H
+#define HFI1_IPOIB_H
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/atomic.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/if_infiniband.h>
+
+#include "hfi.h"
+#include "iowait.h"
+#include "netdev.h"
+
+#include <rdma/ib_verbs.h>
+
+#define HFI1_IPOIB_ENTROPY_SHIFT   24
+
+#define HFI1_IPOIB_TXREQ_NAME_LEN   32
+
+#define HFI1_IPOIB_PSEUDO_LEN 20
+#define HFI1_IPOIB_ENCAP_LEN 4
+
+struct hfi1_ipoib_dev_priv;
+
+union hfi1_ipoib_flow {
+	u16 as_int;
+	struct {
+		u8 tx_queue;
+		u8 sc5;
+	} __attribute__((__packed__));
+};
+
+/**
+ * struct hfi1_ipoib_circ_buf - List of items to be processed
+ * @items: ring of items
+ * @head: ring head
+ * @tail: ring tail
+ * @max_items: max items + 1 that the ring can contain
+ * @producer_lock: producer sync lock
+ * @consumer_lock: consumer sync lock
+ */
+struct ipoib_txreq;
+struct hfi1_ipoib_circ_buf {
+	struct ipoib_txreq **items;
+	unsigned long head;
+	unsigned long tail;
+	unsigned long max_items;
+	spinlock_t producer_lock; /* head sync lock */
+	spinlock_t consumer_lock; /* tail sync lock */
+};
+
+/**
+ * struct hfi1_ipoib_txq - IPOIB per Tx queue information
+ * @priv: private pointer
+ * @sde: sdma engine
+ * @tx_list: tx request list
+ * @sent_txreqs: count of txreqs posted to sdma
+ * @stops: count of stops of queue
+ * @ring_full: ring has been filled
+ * @no_desc: descriptor shortage seen
+ * @flow: tracks when list needs to be flushed for a flow change
+ * @q_idx: ipoib Tx queue index
+ * @pkts_sent: indicator that packets have been sent from this queue
+ * @wait: iowait structure
+ * @complete_txreqs: count of txreqs completed by sdma
+ * @napi: pointer to tx napi interface
+ * @tx_ring: ring of ipoib txreqs to be reaped by napi callback
+ */
+struct hfi1_ipoib_txq {
+	struct hfi1_ipoib_dev_priv *priv;
+	struct sdma_engine *sde;
+	struct list_head tx_list;
+	u64 sent_txreqs;
+	atomic_t stops;
+	atomic_t ring_full;
+	atomic_t no_desc;
+	union hfi1_ipoib_flow flow;
+	u8 q_idx;
+	bool pkts_sent;
+	struct iowait wait;
+
+	atomic64_t ____cacheline_aligned_in_smp complete_txreqs;
+	struct napi_struct *napi;
+	struct hfi1_ipoib_circ_buf tx_ring;
+};
+
+struct hfi1_ipoib_dev_priv {
+	struct hfi1_devdata *dd;
+	struct net_device   *netdev;
+	struct ib_device    *device;
+	struct hfi1_ipoib_txq *txqs;
+	struct kmem_cache *txreq_cache;
+	struct napi_struct *tx_napis;
+	u16 pkey;
+	u16 pkey_index;
+	u32 qkey;
+	u8 port_num;
+
+	const struct net_device_ops *netdev_ops;
+	struct rvt_qp *qp;
+	struct pcpu_sw_netstats __percpu *netstats;
+};
+
+/* hfi1 ipoib rdma netdev's private data structure */
+struct hfi1_ipoib_rdma_netdev {
+	struct rdma_netdev rn;  /* keep this first */
+	/* followed by device private data */
+	struct hfi1_ipoib_dev_priv dev_priv;
+};
+
+static inline struct hfi1_ipoib_dev_priv *
+hfi1_ipoib_priv(const struct net_device *dev)
+{
+	return &((struct hfi1_ipoib_rdma_netdev *)netdev_priv(dev))->dev_priv;
+}
+
+static inline void
+hfi1_ipoib_update_rx_netstats(struct hfi1_ipoib_dev_priv *priv,
+			      u64 packets,
+			      u64 bytes)
+{
+	struct pcpu_sw_netstats *netstats = this_cpu_ptr(priv->netstats);
+
+	u64_stats_update_begin(&netstats->syncp);
+	netstats->rx_packets += packets;
+	netstats->rx_bytes += bytes;
+	u64_stats_update_end(&netstats->syncp);
+}
+
+static inline void
+hfi1_ipoib_update_tx_netstats(struct hfi1_ipoib_dev_priv *priv,
+			      u64 packets,
+			      u64 bytes)
+{
+	struct pcpu_sw_netstats *netstats = this_cpu_ptr(priv->netstats);
+
+	u64_stats_update_begin(&netstats->syncp);
+	netstats->tx_packets += packets;
+	netstats->tx_bytes += bytes;
+	u64_stats_update_end(&netstats->syncp);
+}
+
+int hfi1_ipoib_send_dma(struct net_device *dev,
+			struct sk_buff *skb,
+			struct ib_ah *address,
+			u32 dqpn);
+
+int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv);
+void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv);
+
+int hfi1_ipoib_rxq_init(struct net_device *dev);
+void hfi1_ipoib_rxq_deinit(struct net_device *dev);
+
+void hfi1_ipoib_napi_tx_enable(struct net_device *dev);
+void hfi1_ipoib_napi_tx_disable(struct net_device *dev);
+
+struct sk_buff *hfi1_ipoib_prepare_skb(struct hfi1_netdev_rxq *rxq,
+				       int size, void *data);
+
+int hfi1_ipoib_rn_get_params(struct ib_device *device,
+			     u8 port_num,
+			     enum rdma_netdev_t type,
+			     struct rdma_netdev_alloc_params *params);
+
+#endif /* HFI1_IPOIB_H */
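
The tx_ring documented above is a single-producer/single-consumer ring sized to a power of two, so indices advance by masking and one slot is kept free to tell full from empty (the real structure adds producer/consumer spinlocks). A minimal sketch of that indexing, with illustrative names only:

    #include <stdbool.h>
    #include <stddef.h>

    struct ring {
        void **items;
        unsigned long head;        /* producer index */
        unsigned long tail;        /* consumer index */
        unsigned long max_items;   /* power of two */
    };

    static bool ring_push(struct ring *r, void *item)
    {
        unsigned long next = (r->head + 1) & (r->max_items - 1);

        if (next == r->tail)       /* full: one slot stays free */
            return false;
        r->items[r->head] = item;
        r->head = next;
        return true;
    }

    static void *ring_pop(struct ring *r)
    {
        void *item;

        if (r->tail == r->head)    /* empty */
            return NULL;
        item = r->items[r->tail];
        r->tail = (r->tail + 1) & (r->max_items - 1);
        return item;
    }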
diff --git a/drivers/infiniband/hw/hfi1/ipoib_main.c b/drivers/infiniband/hw/hfi1/ipoib_main.c
new file mode 100644
index 0000000..22299b0
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/ipoib_main.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2020 Intel Corporation.
+ *
+ */
+
+/*
+ * This file contains HFI1 support for ipoib functionality
+ */
+
+#include "ipoib.h"
+#include "hfi.h"
+
+static u32 qpn_from_mac(u8 *mac_arr)
+{
+	return (u32)mac_arr[1] << 16 | mac_arr[2] << 8 | mac_arr[3];
+}
+
+static int hfi1_ipoib_dev_init(struct net_device *dev)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+	int ret;
+
+	priv->netstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+
+	ret = priv->netdev_ops->ndo_init(dev);
+	if (ret)
+		return ret;
+
+	ret = hfi1_netdev_add_data(priv->dd,
+				   qpn_from_mac(priv->netdev->dev_addr),
+				   dev);
+	if (ret < 0) {
+		priv->netdev_ops->ndo_uninit(dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void hfi1_ipoib_dev_uninit(struct net_device *dev)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+
+	hfi1_netdev_remove_data(priv->dd, qpn_from_mac(priv->netdev->dev_addr));
+
+	priv->netdev_ops->ndo_uninit(dev);
+}
+
+static int hfi1_ipoib_dev_open(struct net_device *dev)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+	int ret;
+
+	ret = priv->netdev_ops->ndo_open(dev);
+	if (!ret) {
+		struct hfi1_ibport *ibp = to_iport(priv->device,
+						   priv->port_num);
+		struct rvt_qp *qp;
+		u32 qpn = qpn_from_mac(priv->netdev->dev_addr);
+
+		rcu_read_lock();
+		qp = rvt_lookup_qpn(ib_to_rvt(priv->device), &ibp->rvp, qpn);
+		if (!qp) {
+			rcu_read_unlock();
+			priv->netdev_ops->ndo_stop(dev);
+			return -EINVAL;
+		}
+		rvt_get_qp(qp);
+		priv->qp = qp;
+		rcu_read_unlock();
+
+		hfi1_netdev_enable_queues(priv->dd);
+		hfi1_ipoib_napi_tx_enable(dev);
+	}
+
+	return ret;
+}
+
+static int hfi1_ipoib_dev_stop(struct net_device *dev)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+
+	if (!priv->qp)
+		return 0;
+
+	hfi1_ipoib_napi_tx_disable(dev);
+	hfi1_netdev_disable_queues(priv->dd);
+
+	rvt_put_qp(priv->qp);
+	priv->qp = NULL;
+
+	return priv->netdev_ops->ndo_stop(dev);
+}
+
+static void hfi1_ipoib_dev_get_stats64(struct net_device *dev,
+				       struct rtnl_link_stats64 *storage)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+
+	netdev_stats_to_stats64(storage, &dev->stats);
+	dev_fetch_sw_netstats(storage, priv->netstats);
+}
+
+static const struct net_device_ops hfi1_ipoib_netdev_ops = {
+	.ndo_init         = hfi1_ipoib_dev_init,
+	.ndo_uninit       = hfi1_ipoib_dev_uninit,
+	.ndo_open         = hfi1_ipoib_dev_open,
+	.ndo_stop         = hfi1_ipoib_dev_stop,
+	.ndo_get_stats64  = hfi1_ipoib_dev_get_stats64,
+};
+
+static int hfi1_ipoib_send(struct net_device *dev,
+			   struct sk_buff *skb,
+			   struct ib_ah *address,
+			   u32 dqpn)
+{
+	return hfi1_ipoib_send_dma(dev, skb, address, dqpn);
+}
+
+static int hfi1_ipoib_mcast_attach(struct net_device *dev,
+				   struct ib_device *device,
+				   union ib_gid *mgid,
+				   u16 mlid,
+				   int set_qkey,
+				   u32 qkey)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+	u32 qpn = (u32)qpn_from_mac(priv->netdev->dev_addr);
+	struct hfi1_ibport *ibp = to_iport(priv->device, priv->port_num);
+	struct rvt_qp *qp;
+	int ret = -EINVAL;
+
+	rcu_read_lock();
+
+	qp = rvt_lookup_qpn(ib_to_rvt(priv->device), &ibp->rvp, qpn);
+	if (qp) {
+		rvt_get_qp(qp);
+		rcu_read_unlock();
+		if (set_qkey)
+			priv->qkey = qkey;
+
+		/* attach QP to multicast group */
+		ret = ib_attach_mcast(&qp->ibqp, mgid, mlid);
+		rvt_put_qp(qp);
+	} else {
+		rcu_read_unlock();
+	}
+
+	return ret;
+}
+
+static int hfi1_ipoib_mcast_detach(struct net_device *dev,
+				   struct ib_device *device,
+				   union ib_gid *mgid,
+				   u16 mlid)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+	u32 qpn = (u32)qpn_from_mac(priv->netdev->dev_addr);
+	struct hfi1_ibport *ibp = to_iport(priv->device, priv->port_num);
+	struct rvt_qp *qp;
+	int ret = -EINVAL;
+
+	rcu_read_lock();
+
+	qp = rvt_lookup_qpn(ib_to_rvt(priv->device), &ibp->rvp, qpn);
+	if (qp) {
+		rvt_get_qp(qp);
+		rcu_read_unlock();
+		ret = ib_detach_mcast(&qp->ibqp, mgid, mlid);
+		rvt_put_qp(qp);
+	} else {
+		rcu_read_unlock();
+	}
+	return ret;
+}
+
+static void hfi1_ipoib_netdev_dtor(struct net_device *dev)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+
+	hfi1_ipoib_txreq_deinit(priv);
+	hfi1_ipoib_rxq_deinit(priv->netdev);
+
+	free_percpu(priv->netstats);
+}
+
+static void hfi1_ipoib_set_id(struct net_device *dev, int id)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+
+	priv->pkey_index = (u16)id;
+	ib_query_pkey(priv->device,
+		      priv->port_num,
+		      priv->pkey_index,
+		      &priv->pkey);
+}
+
+static int hfi1_ipoib_setup_rn(struct ib_device *device,
+			       u8 port_num,
+			       struct net_device *netdev,
+			       void *param)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(device);
+	struct rdma_netdev *rn = netdev_priv(netdev);
+	struct hfi1_ipoib_dev_priv *priv;
+	int rc;
+
+	rn->send = hfi1_ipoib_send;
+	rn->attach_mcast = hfi1_ipoib_mcast_attach;
+	rn->detach_mcast = hfi1_ipoib_mcast_detach;
+	rn->set_id = hfi1_ipoib_set_id;
+	rn->hca = device;
+	rn->port_num = port_num;
+	rn->mtu = netdev->mtu;
+
+	priv = hfi1_ipoib_priv(netdev);
+	priv->dd = dd;
+	priv->netdev = netdev;
+	priv->device = device;
+	priv->port_num = port_num;
+	priv->netdev_ops = netdev->netdev_ops;
+
+	ib_query_pkey(device, port_num, priv->pkey_index, &priv->pkey);
+
+	rc = hfi1_ipoib_txreq_init(priv);
+	if (rc) {
+		dd_dev_err(dd, "IPoIB netdev TX init - failed(%d)\n", rc);
+		return rc;
+	}
+
+	rc = hfi1_ipoib_rxq_init(netdev);
+	if (rc) {
+		dd_dev_err(dd, "IPoIB netdev RX init - failed(%d)\n", rc);
+		hfi1_ipoib_txreq_deinit(priv);
+		return rc;
+	}
+
+	netdev->netdev_ops = &hfi1_ipoib_netdev_ops;
+
+	netdev->priv_destructor = hfi1_ipoib_netdev_dtor;
+	netdev->needs_free_netdev = true;
+
+	return 0;
+}
+
+int hfi1_ipoib_rn_get_params(struct ib_device *device,
+			     u8 port_num,
+			     enum rdma_netdev_t type,
+			     struct rdma_netdev_alloc_params *params)
+{
+	struct hfi1_devdata *dd = dd_from_ibdev(device);
+
+	if (type != RDMA_NETDEV_IPOIB)
+		return -EOPNOTSUPP;
+
+	if (!HFI1_CAP_IS_KSET(AIP) || !dd->num_netdev_contexts)
+		return -EOPNOTSUPP;
+
+	if (!port_num || port_num > dd->num_pports)
+		return -EINVAL;
+
+	params->sizeof_priv = sizeof(struct hfi1_ipoib_rdma_netdev);
+	params->txqs = dd->num_sdma;
+	params->rxqs = dd->num_netdev_contexts;
+	params->param = NULL;
+	params->initialize_rdma_netdev = hfi1_ipoib_setup_rn;
+
+	return 0;
+}
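
The mcast attach/detach paths above derive the local QPN from the netdev hardware address via qpn_from_mac(), and the TX header builder later in this patch rebuilds the same 24-bit value from dev_addr bytes 1-3. qpn_from_mac() itself is not shown here; the standalone userspace sketch below (not driver code) mirrors that byte layout under the assumption that the address follows the usual 20-byte IPoIB format from RFC 4391: one flags byte, a 3-byte QPN, then the 16-byte port GID.

#include <stdint.h>
#include <stdio.h>

#define IPOIB_HW_ADDR_LEN 20

static uint32_t qpn_from_ipoib_addr(const uint8_t *addr)
{
	/* bytes 1..3 hold the queue pair number, big-endian */
	return ((uint32_t)addr[1] << 16) |
	       ((uint32_t)addr[2] << 8) |
	       (uint32_t)addr[3];
}

int main(void)
{
	/* flags byte, then QPN 0x00048a; remaining GID bytes stay zero */
	uint8_t addr[IPOIB_HW_ADDR_LEN] = { 0x80, 0x00, 0x04, 0x8a };

	printf("qpn = 0x%x\n", (unsigned int)qpn_from_ipoib_addr(addr));
	return 0;
}
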
diff --git a/drivers/infiniband/hw/hfi1/ipoib_rx.c b/drivers/infiniband/hw/hfi1/ipoib_rx.c
new file mode 100644
index 0000000..3afa754
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/ipoib_rx.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2020 Intel Corporation.
+ *
+ */
+
+#include "netdev.h"
+#include "ipoib.h"
+
+#define HFI1_IPOIB_SKB_PAD ((NET_SKB_PAD) + (NET_IP_ALIGN))
+
+static void copy_ipoib_buf(struct sk_buff *skb, void *data, int size)
+{
+	void *dst_data;
+
+	skb_checksum_none_assert(skb);
+	skb->protocol = *((__be16 *)data);
+
+	dst_data = skb_put(skb, size);
+	memcpy(dst_data, data, size);
+	skb->mac_header = HFI1_IPOIB_PSEUDO_LEN;
+	skb_pull(skb, HFI1_IPOIB_ENCAP_LEN);
+}
+
+static struct sk_buff *prepare_frag_skb(struct napi_struct *napi, int size)
+{
+	struct sk_buff *skb;
+	int skb_size = SKB_DATA_ALIGN(size + HFI1_IPOIB_SKB_PAD);
+	void *frag;
+
+	skb_size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	skb_size = SKB_DATA_ALIGN(skb_size);
+	frag = napi_alloc_frag(skb_size);
+
+	if (unlikely(!frag))
+		return napi_alloc_skb(napi, size);
+
+	skb = build_skb(frag, skb_size);
+
+	if (unlikely(!skb)) {
+		skb_free_frag(frag);
+		return NULL;
+	}
+
+	skb_reserve(skb, HFI1_IPOIB_SKB_PAD);
+	return skb;
+}
+
+struct sk_buff *hfi1_ipoib_prepare_skb(struct hfi1_netdev_rxq *rxq,
+				       int size, void *data)
+{
+	struct napi_struct *napi = &rxq->napi;
+	int skb_size = size + HFI1_IPOIB_ENCAP_LEN;
+	struct sk_buff *skb;
+
+	/*
+	 * For smaller allocations (up to one page minus skb overhead) use the
+	 * napi skb cache; otherwise fall back to the napi page-frag cache.
+	 */
+	if (size <= SKB_WITH_OVERHEAD(PAGE_SIZE))
+		skb = napi_alloc_skb(napi, skb_size);
+	else
+		skb = prepare_frag_skb(napi, skb_size);
+
+	if (unlikely(!skb))
+		return NULL;
+
+	copy_ipoib_buf(skb, data, size);
+
+	return skb;
+}
+
+int hfi1_ipoib_rxq_init(struct net_device *netdev)
+{
+	struct hfi1_ipoib_dev_priv *ipoib_priv = hfi1_ipoib_priv(netdev);
+	struct hfi1_devdata *dd = ipoib_priv->dd;
+	int ret;
+
+	ret = hfi1_netdev_rx_init(dd);
+	if (ret)
+		return ret;
+
+	hfi1_init_aip_rsm(dd);
+
+	return ret;
+}
+
+void hfi1_ipoib_rxq_deinit(struct net_device *netdev)
+{
+	struct hfi1_ipoib_dev_priv *ipoib_priv = hfi1_ipoib_priv(netdev);
+	struct hfi1_devdata *dd = ipoib_priv->dd;
+
+	hfi1_deinit_aip_rsm(dd);
+	hfi1_netdev_rx_destroy(dd);
+}
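
hfi1_ipoib_prepare_skb() above switches allocators based on payload size: up to SKB_WITH_OVERHEAD(PAGE_SIZE) it takes an skb from the per-CPU napi cache, beyond that it builds the skb around a napi page-frag allocation. The userspace sketch below reproduces only that threshold decision; PAGE_SIZE, the cache-line alignment and the skb_shared_info size are assumed stand-in constants, not the kernel's exact values.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE		4096UL
#define SMP_CACHE_BYTES		64UL
#define SKB_SHARED_INFO_SIZE	320UL	/* stand-in for sizeof(struct skb_shared_info) */

#define SKB_DATA_ALIGN(x)	(((x) + (SMP_CACHE_BYTES - 1)) & ~(SMP_CACHE_BYTES - 1))
#define SKB_WITH_OVERHEAD(x)	((x) - SKB_DATA_ALIGN(SKB_SHARED_INFO_SIZE))

/* same comparison as hfi1_ipoib_prepare_skb(), reduced to a string result */
static const char *alloc_path(unsigned long payload)
{
	return payload <= SKB_WITH_OVERHEAD(PAGE_SIZE) ?
		"napi skb cache" : "napi page-frag cache";
}

int main(void)
{
	unsigned long sizes[] = { 256, 3500, 8192 };

	for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%5lu bytes -> %s\n", sizes[i], alloc_path(sizes[i]));
	return 0;
}
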
diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c
new file mode 100644
index 0000000..ab1eeff
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c
@@ -0,0 +1,856 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2020 Intel Corporation.
+ *
+ */
+
+/*
+ * This file contains HFI1 support for IPOIB SDMA functionality
+ */
+
+#include <linux/log2.h>
+#include <linux/circ_buf.h>
+
+#include "sdma.h"
+#include "verbs.h"
+#include "trace_ibhdrs.h"
+#include "ipoib.h"
+
+/* Add a convenience helper */
+#define CIRC_ADD(val, add, size) (((val) + (add)) & ((size) - 1))
+#define CIRC_NEXT(val, size) CIRC_ADD(val, 1, size)
+#define CIRC_PREV(val, size) CIRC_ADD(val, -1, size)
+
+/**
+ * struct ipoib_txreq - IPOIB transmit descriptor
+ * @txreq: sdma transmit request
+ * @sdma_hdr: 9b ib headers
+ * @sdma_status: status returned by sdma engine
+ * @priv: ipoib netdev private data
+ * @txq: txq on which skb was output
+ * @skb: skb to send
+ */
+struct ipoib_txreq {
+	struct sdma_txreq           txreq;
+	struct hfi1_sdma_header     sdma_hdr;
+	int                         sdma_status;
+	struct hfi1_ipoib_dev_priv *priv;
+	struct hfi1_ipoib_txq      *txq;
+	struct sk_buff             *skb;
+};
+
+struct ipoib_txparms {
+	struct hfi1_devdata        *dd;
+	struct rdma_ah_attr        *ah_attr;
+	struct hfi1_ibport         *ibp;
+	struct hfi1_ipoib_txq      *txq;
+	union hfi1_ipoib_flow       flow;
+	u32                         dqpn;
+	u8                          hdr_dwords;
+	u8                          entropy;
+};
+
+static u64 hfi1_ipoib_txreqs(const u64 sent, const u64 completed)
+{
+	return sent - completed;
+}
+
+static u64 hfi1_ipoib_used(struct hfi1_ipoib_txq *txq)
+{
+	return hfi1_ipoib_txreqs(txq->sent_txreqs,
+				 atomic64_read(&txq->complete_txreqs));
+}
+
+static void hfi1_ipoib_stop_txq(struct hfi1_ipoib_txq *txq)
+{
+	if (atomic_inc_return(&txq->stops) == 1)
+		netif_stop_subqueue(txq->priv->netdev, txq->q_idx);
+}
+
+static void hfi1_ipoib_wake_txq(struct hfi1_ipoib_txq *txq)
+{
+	if (atomic_dec_and_test(&txq->stops))
+		netif_wake_subqueue(txq->priv->netdev, txq->q_idx);
+}
+
+static uint hfi1_ipoib_ring_hwat(struct hfi1_ipoib_txq *txq)
+{
+	return min_t(uint, txq->priv->netdev->tx_queue_len,
+		     txq->tx_ring.max_items - 1);
+}
+
+static uint hfi1_ipoib_ring_lwat(struct hfi1_ipoib_txq *txq)
+{
+	return min_t(uint, txq->priv->netdev->tx_queue_len,
+		     txq->tx_ring.max_items) >> 1;
+}
+
+static void hfi1_ipoib_check_queue_depth(struct hfi1_ipoib_txq *txq)
+{
+	++txq->sent_txreqs;
+	if (hfi1_ipoib_used(txq) >= hfi1_ipoib_ring_hwat(txq) &&
+	    !atomic_xchg(&txq->ring_full, 1))
+		hfi1_ipoib_stop_txq(txq);
+}
+
+static void hfi1_ipoib_check_queue_stopped(struct hfi1_ipoib_txq *txq)
+{
+	struct net_device *dev = txq->priv->netdev;
+
+	/* If shutting down just return as queue state is irrelevant */
+	if (unlikely(dev->reg_state != NETREG_REGISTERED))
+		return;
+
+	/*
+	 * When the queue has been drained to less than half full it will be
+	 * restarted.
+	 * The size of the txreq ring is fixed at initialization.
+	 * The tx queue len can be adjusted upward while the interface is
+	 * running.
+	 * The tx queue len can be large enough to overflow the txreq_ring.
+	 * Use the minimum of the current tx_queue_len or the rings max txreqs
+	 * to protect against ring overflow.
+	 */
+	if (hfi1_ipoib_used(txq) < hfi1_ipoib_ring_lwat(txq) &&
+	    atomic_xchg(&txq->ring_full, 0))
+		hfi1_ipoib_wake_txq(txq);
+}
+
+static void hfi1_ipoib_free_tx(struct ipoib_txreq *tx, int budget)
+{
+	struct hfi1_ipoib_dev_priv *priv = tx->priv;
+
+	if (likely(!tx->sdma_status)) {
+		hfi1_ipoib_update_tx_netstats(priv, 1, tx->skb->len);
+	} else {
+		++priv->netdev->stats.tx_errors;
+		dd_dev_warn(priv->dd,
+			    "%s: Status = 0x%x pbc 0x%llx txq = %d sde = %d\n",
+			    __func__, tx->sdma_status,
+			    le64_to_cpu(tx->sdma_hdr.pbc), tx->txq->q_idx,
+			    tx->txq->sde->this_idx);
+	}
+
+	napi_consume_skb(tx->skb, budget);
+	sdma_txclean(priv->dd, &tx->txreq);
+	kmem_cache_free(priv->txreq_cache, tx);
+}
+
+static int hfi1_ipoib_drain_tx_ring(struct hfi1_ipoib_txq *txq, int budget)
+{
+	struct hfi1_ipoib_circ_buf *tx_ring = &txq->tx_ring;
+	unsigned long head;
+	unsigned long tail;
+	unsigned int max_tx;
+	int work_done;
+	int tx_count;
+
+	spin_lock_bh(&tx_ring->consumer_lock);
+
+	/* Read index before reading contents at that index. */
+	head = smp_load_acquire(&tx_ring->head);
+	tail = tx_ring->tail;
+	max_tx = tx_ring->max_items;
+
+	work_done = min_t(int, CIRC_CNT(head, tail, max_tx), budget);
+
+	for (tx_count = work_done; tx_count; tx_count--) {
+		hfi1_ipoib_free_tx(tx_ring->items[tail], budget);
+		tail = CIRC_NEXT(tail, max_tx);
+	}
+
+	atomic64_add(work_done, &txq->complete_txreqs);
+
+	/* Finished freeing tx items so store the tail value. */
+	smp_store_release(&tx_ring->tail, tail);
+
+	spin_unlock_bh(&tx_ring->consumer_lock);
+
+	hfi1_ipoib_check_queue_stopped(txq);
+
+	return work_done;
+}
+
+static int hfi1_ipoib_process_tx_ring(struct napi_struct *napi, int budget)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(napi->dev);
+	struct hfi1_ipoib_txq *txq = &priv->txqs[napi - priv->tx_napis];
+
+	int work_done = hfi1_ipoib_drain_tx_ring(txq, budget);
+
+	if (work_done < budget)
+		napi_complete_done(napi, work_done);
+
+	return work_done;
+}
+
+static void hfi1_ipoib_add_tx(struct ipoib_txreq *tx)
+{
+	struct hfi1_ipoib_circ_buf *tx_ring = &tx->txq->tx_ring;
+	unsigned long head;
+	unsigned long tail;
+	size_t max_tx;
+
+	spin_lock(&tx_ring->producer_lock);
+
+	head = tx_ring->head;
+	tail = READ_ONCE(tx_ring->tail);
+	max_tx = tx_ring->max_items;
+
+	if (likely(CIRC_SPACE(head, tail, max_tx))) {
+		tx_ring->items[head] = tx;
+
+		/* Finish storing txreq before incrementing head. */
+		smp_store_release(&tx_ring->head, CIRC_ADD(head, 1, max_tx));
+		napi_schedule(tx->txq->napi);
+	} else {
+		struct hfi1_ipoib_txq *txq = tx->txq;
+		struct hfi1_ipoib_dev_priv *priv = tx->priv;
+
+		/* Ring was full */
+		hfi1_ipoib_free_tx(tx, 0);
+		atomic64_inc(&txq->complete_txreqs);
+		dd_dev_dbg(priv->dd, "txq %d full.\n", txq->q_idx);
+	}
+
+	spin_unlock(&tx_ring->producer_lock);
+}
+
+static void hfi1_ipoib_sdma_complete(struct sdma_txreq *txreq, int status)
+{
+	struct ipoib_txreq *tx = container_of(txreq, struct ipoib_txreq, txreq);
+
+	tx->sdma_status = status;
+
+	hfi1_ipoib_add_tx(tx);
+}
+
+static int hfi1_ipoib_build_ulp_payload(struct ipoib_txreq *tx,
+					struct ipoib_txparms *txp)
+{
+	struct hfi1_devdata *dd = txp->dd;
+	struct sdma_txreq *txreq = &tx->txreq;
+	struct sk_buff *skb = tx->skb;
+	int ret = 0;
+	int i;
+
+	if (skb_headlen(skb)) {
+		ret = sdma_txadd_kvaddr(dd, txreq, skb->data, skb_headlen(skb));
+		if (unlikely(ret))
+			return ret;
+	}
+
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+		ret = sdma_txadd_page(dd,
+				      txreq,
+				      skb_frag_page(frag),
+				      frag->bv_offset,
+				      skb_frag_size(frag));
+		if (unlikely(ret))
+			break;
+	}
+
+	return ret;
+}
+
+static int hfi1_ipoib_build_tx_desc(struct ipoib_txreq *tx,
+				    struct ipoib_txparms *txp)
+{
+	struct hfi1_devdata *dd = txp->dd;
+	struct sdma_txreq *txreq = &tx->txreq;
+	struct hfi1_sdma_header *sdma_hdr = &tx->sdma_hdr;
+	u16 pkt_bytes =
+		sizeof(sdma_hdr->pbc) + (txp->hdr_dwords << 2) + tx->skb->len;
+	int ret;
+
+	ret = sdma_txinit(txreq, 0, pkt_bytes, hfi1_ipoib_sdma_complete);
+	if (unlikely(ret))
+		return ret;
+
+	/* add pbc + headers */
+	ret = sdma_txadd_kvaddr(dd,
+				txreq,
+				sdma_hdr,
+				sizeof(sdma_hdr->pbc) + (txp->hdr_dwords << 2));
+	if (unlikely(ret))
+		return ret;
+
+	/* add the ulp payload */
+	return hfi1_ipoib_build_ulp_payload(tx, txp);
+}
+
+static void hfi1_ipoib_build_ib_tx_headers(struct ipoib_txreq *tx,
+					   struct ipoib_txparms *txp)
+{
+	struct hfi1_ipoib_dev_priv *priv = tx->priv;
+	struct hfi1_sdma_header *sdma_hdr = &tx->sdma_hdr;
+	struct sk_buff *skb = tx->skb;
+	struct hfi1_pportdata *ppd = ppd_from_ibp(txp->ibp);
+	struct rdma_ah_attr *ah_attr = txp->ah_attr;
+	struct ib_other_headers *ohdr;
+	struct ib_grh *grh;
+	u16 dwords;
+	u16 slid;
+	u16 dlid;
+	u16 lrh0;
+	u32 bth0;
+	u32 sqpn = (u32)(priv->netdev->dev_addr[1] << 16 |
+			 priv->netdev->dev_addr[2] << 8 |
+			 priv->netdev->dev_addr[3]);
+	u16 payload_dwords;
+	u8 pad_cnt;
+
+	pad_cnt = -skb->len & 3;
+
+	/* Includes ICRC */
+	payload_dwords = ((skb->len + pad_cnt) >> 2) + SIZE_OF_CRC;
+
+	/* header size in dwords LRH+BTH+DETH = (8+12+8)/4. */
+	txp->hdr_dwords = 7;
+
+	if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) {
+		grh = &sdma_hdr->hdr.ibh.u.l.grh;
+		txp->hdr_dwords +=
+			hfi1_make_grh(txp->ibp,
+				      grh,
+				      rdma_ah_read_grh(ah_attr),
+				      txp->hdr_dwords - LRH_9B_DWORDS,
+				      payload_dwords);
+		lrh0 = HFI1_LRH_GRH;
+		ohdr = &sdma_hdr->hdr.ibh.u.l.oth;
+	} else {
+		lrh0 = HFI1_LRH_BTH;
+		ohdr = &sdma_hdr->hdr.ibh.u.oth;
+	}
+
+	lrh0 |= (rdma_ah_get_sl(ah_attr) & 0xf) << 4;
+	lrh0 |= (txp->flow.sc5 & 0xf) << 12;
+
+	dlid = opa_get_lid(rdma_ah_get_dlid(ah_attr), 9B);
+	if (dlid == be16_to_cpu(IB_LID_PERMISSIVE)) {
+		slid = be16_to_cpu(IB_LID_PERMISSIVE);
+	} else {
+		u16 lid = (u16)ppd->lid;
+
+		if (lid) {
+			lid |= rdma_ah_get_path_bits(ah_attr) &
+				((1 << ppd->lmc) - 1);
+			slid = lid;
+		} else {
+			slid = be16_to_cpu(IB_LID_PERMISSIVE);
+		}
+	}
+
+	/* Includes ICRC */
+	dwords = txp->hdr_dwords + payload_dwords;
+
+	/* Build the lrh */
+	sdma_hdr->hdr.hdr_type = HFI1_PKT_TYPE_9B;
+	hfi1_make_ib_hdr(&sdma_hdr->hdr.ibh, lrh0, dwords, dlid, slid);
+
+	/* Build the bth */
+	bth0 = (IB_OPCODE_UD_SEND_ONLY << 24) | (pad_cnt << 20) | priv->pkey;
+
+	ohdr->bth[0] = cpu_to_be32(bth0);
+	ohdr->bth[1] = cpu_to_be32(txp->dqpn);
+	ohdr->bth[2] = cpu_to_be32(mask_psn((u32)txp->txq->sent_txreqs));
+
+	/* Build the deth */
+	ohdr->u.ud.deth[0] = cpu_to_be32(priv->qkey);
+	ohdr->u.ud.deth[1] = cpu_to_be32((txp->entropy <<
+					  HFI1_IPOIB_ENTROPY_SHIFT) | sqpn);
+
+	/* Construct the pbc. */
+	sdma_hdr->pbc =
+		cpu_to_le64(create_pbc(ppd,
+				       ib_is_sc5(txp->flow.sc5) <<
+							      PBC_DC_INFO_SHIFT,
+				       0,
+				       sc_to_vlt(priv->dd, txp->flow.sc5),
+				       dwords - SIZE_OF_CRC +
+						(sizeof(sdma_hdr->pbc) >> 2)));
+}
+
+static struct ipoib_txreq *hfi1_ipoib_send_dma_common(struct net_device *dev,
+						      struct sk_buff *skb,
+						      struct ipoib_txparms *txp)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+	struct ipoib_txreq *tx;
+	int ret;
+
+	tx = kmem_cache_alloc_node(priv->txreq_cache,
+				   GFP_ATOMIC,
+				   priv->dd->node);
+	if (unlikely(!tx))
+		return ERR_PTR(-ENOMEM);
+
+	/* so that we can test if the sdma descriptors are there */
+	tx->txreq.num_desc = 0;
+	tx->priv = priv;
+	tx->txq = txp->txq;
+	tx->skb = skb;
+	INIT_LIST_HEAD(&tx->txreq.list);
+
+	hfi1_ipoib_build_ib_tx_headers(tx, txp);
+
+	ret = hfi1_ipoib_build_tx_desc(tx, txp);
+	if (likely(!ret)) {
+		if (txp->txq->flow.as_int != txp->flow.as_int) {
+			txp->txq->flow.tx_queue = txp->flow.tx_queue;
+			txp->txq->flow.sc5 = txp->flow.sc5;
+			txp->txq->sde =
+				sdma_select_engine_sc(priv->dd,
+						      txp->flow.tx_queue,
+						      txp->flow.sc5);
+		}
+
+		return tx;
+	}
+
+	sdma_txclean(priv->dd, &tx->txreq);
+	kmem_cache_free(priv->txreq_cache, tx);
+
+	return ERR_PTR(ret);
+}
+
+static int hfi1_ipoib_submit_tx_list(struct net_device *dev,
+				     struct hfi1_ipoib_txq *txq)
+{
+	int ret;
+	u16 count_out;
+
+	ret = sdma_send_txlist(txq->sde,
+			       iowait_get_ib_work(&txq->wait),
+			       &txq->tx_list,
+			       &count_out);
+	if (likely(!ret) || ret == -EBUSY || ret == -ECOMM)
+		return ret;
+
+	dd_dev_warn(txq->priv->dd, "cannot send skb tx list, err %d.\n", ret);
+
+	return ret;
+}
+
+static int hfi1_ipoib_flush_tx_list(struct net_device *dev,
+				    struct hfi1_ipoib_txq *txq)
+{
+	int ret = 0;
+
+	if (!list_empty(&txq->tx_list)) {
+		/* Flush the current list */
+		ret = hfi1_ipoib_submit_tx_list(dev, txq);
+
+		if (unlikely(ret))
+			if (ret != -EBUSY)
+				++dev->stats.tx_carrier_errors;
+	}
+
+	return ret;
+}
+
+static int hfi1_ipoib_submit_tx(struct hfi1_ipoib_txq *txq,
+				struct ipoib_txreq *tx)
+{
+	int ret;
+
+	ret = sdma_send_txreq(txq->sde,
+			      iowait_get_ib_work(&txq->wait),
+			      &tx->txreq,
+			      txq->pkts_sent);
+	if (likely(!ret)) {
+		txq->pkts_sent = true;
+		iowait_starve_clear(txq->pkts_sent, &txq->wait);
+	}
+
+	return ret;
+}
+
+static int hfi1_ipoib_send_dma_single(struct net_device *dev,
+				      struct sk_buff *skb,
+				      struct ipoib_txparms *txp)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+	struct hfi1_ipoib_txq *txq = txp->txq;
+	struct ipoib_txreq *tx;
+	int ret;
+
+	tx = hfi1_ipoib_send_dma_common(dev, skb, txp);
+	if (IS_ERR(tx)) {
+		int ret = PTR_ERR(tx);
+
+		dev_kfree_skb_any(skb);
+
+		if (ret == -ENOMEM)
+			++dev->stats.tx_errors;
+		else
+			++dev->stats.tx_carrier_errors;
+
+		return NETDEV_TX_OK;
+	}
+
+	ret = hfi1_ipoib_submit_tx(txq, tx);
+	if (likely(!ret)) {
+tx_ok:
+		trace_sdma_output_ibhdr(tx->priv->dd,
+					&tx->sdma_hdr.hdr,
+					ib_is_sc5(txp->flow.sc5));
+		hfi1_ipoib_check_queue_depth(txq);
+		return NETDEV_TX_OK;
+	}
+
+	txq->pkts_sent = false;
+
+	if (ret == -EBUSY || ret == -ECOMM)
+		goto tx_ok;
+
+	sdma_txclean(priv->dd, &tx->txreq);
+	dev_kfree_skb_any(skb);
+	kmem_cache_free(priv->txreq_cache, tx);
+	++dev->stats.tx_carrier_errors;
+
+	return NETDEV_TX_OK;
+}
+
+static int hfi1_ipoib_send_dma_list(struct net_device *dev,
+				    struct sk_buff *skb,
+				    struct ipoib_txparms *txp)
+{
+	struct hfi1_ipoib_txq *txq = txp->txq;
+	struct ipoib_txreq *tx;
+
+	/* Has the flow changed? */
+	if (txq->flow.as_int != txp->flow.as_int) {
+		int ret;
+
+		ret = hfi1_ipoib_flush_tx_list(dev, txq);
+		if (unlikely(ret)) {
+			if (ret == -EBUSY)
+				++dev->stats.tx_dropped;
+			dev_kfree_skb_any(skb);
+			return NETDEV_TX_OK;
+		}
+	}
+	tx = hfi1_ipoib_send_dma_common(dev, skb, txp);
+	if (IS_ERR(tx)) {
+		int ret = PTR_ERR(tx);
+
+		dev_kfree_skb_any(skb);
+
+		if (ret == -ENOMEM)
+			++dev->stats.tx_errors;
+		else
+			++dev->stats.tx_carrier_errors;
+
+		return NETDEV_TX_OK;
+	}
+
+	list_add_tail(&tx->txreq.list, &txq->tx_list);
+
+	hfi1_ipoib_check_queue_depth(txq);
+
+	trace_sdma_output_ibhdr(tx->priv->dd,
+				&tx->sdma_hdr.hdr,
+				ib_is_sc5(txp->flow.sc5));
+
+	if (!netdev_xmit_more())
+		(void)hfi1_ipoib_flush_tx_list(dev, txq);
+
+	return NETDEV_TX_OK;
+}
+
+static u8 hfi1_ipoib_calc_entropy(struct sk_buff *skb)
+{
+	if (skb_transport_header_was_set(skb)) {
+		u8 *hdr = (u8 *)skb_transport_header(skb);
+
+		return (hdr[0] ^ hdr[1] ^ hdr[2] ^ hdr[3]);
+	}
+
+	return (u8)skb_get_queue_mapping(skb);
+}
+
+int hfi1_ipoib_send_dma(struct net_device *dev,
+			struct sk_buff *skb,
+			struct ib_ah *address,
+			u32 dqpn)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+	struct ipoib_txparms txp;
+	struct rdma_netdev *rn = netdev_priv(dev);
+
+	if (unlikely(skb->len > rn->mtu + HFI1_IPOIB_ENCAP_LEN)) {
+		dd_dev_warn(priv->dd, "packet len %d (> %d) too long to send, dropping\n",
+			    skb->len,
+			    rn->mtu + HFI1_IPOIB_ENCAP_LEN);
+		++dev->stats.tx_dropped;
+		++dev->stats.tx_errors;
+		dev_kfree_skb_any(skb);
+		return NETDEV_TX_OK;
+	}
+
+	txp.dd = priv->dd;
+	txp.ah_attr = &ibah_to_rvtah(address)->attr;
+	txp.ibp = to_iport(priv->device, priv->port_num);
+	txp.txq = &priv->txqs[skb_get_queue_mapping(skb)];
+	txp.dqpn = dqpn;
+	txp.flow.sc5 = txp.ibp->sl_to_sc[rdma_ah_get_sl(txp.ah_attr)];
+	txp.flow.tx_queue = (u8)skb_get_queue_mapping(skb);
+	txp.entropy = hfi1_ipoib_calc_entropy(skb);
+
+	if (netdev_xmit_more() || !list_empty(&txp.txq->tx_list))
+		return hfi1_ipoib_send_dma_list(dev, skb, &txp);
+
+	return hfi1_ipoib_send_dma_single(dev, skb,  &txp);
+}
+
+/*
+ * hfi1_ipoib_sdma_sleep - ipoib sdma sleep function
+ *
+ * This function gets called from sdma_send_txreq() when there are not enough
+ * sdma descriptors available to send the packet. It adds the Tx queue's wait
+ * structure to the sdma engine's dmawait list so it is woken up when
+ * descriptors become available.
+ */
+static int hfi1_ipoib_sdma_sleep(struct sdma_engine *sde,
+				 struct iowait_work *wait,
+				 struct sdma_txreq *txreq,
+				 uint seq,
+				 bool pkts_sent)
+{
+	struct hfi1_ipoib_txq *txq =
+		container_of(wait->iow, struct hfi1_ipoib_txq, wait);
+
+	write_seqlock(&sde->waitlock);
+
+	if (likely(txq->priv->netdev->reg_state == NETREG_REGISTERED)) {
+		if (sdma_progress(sde, seq, txreq)) {
+			write_sequnlock(&sde->waitlock);
+			return -EAGAIN;
+		}
+
+		if (list_empty(&txreq->list))
+			/* came from non-list submit */
+			list_add_tail(&txreq->list, &txq->tx_list);
+		if (list_empty(&txq->wait.list)) {
+			if (!atomic_xchg(&txq->no_desc, 1))
+				hfi1_ipoib_stop_txq(txq);
+			iowait_queue(pkts_sent, wait->iow, &sde->dmawait);
+		}
+
+		write_sequnlock(&sde->waitlock);
+		return -EBUSY;
+	}
+
+	write_sequnlock(&sde->waitlock);
+	return -EINVAL;
+}
+
+/*
+ * hfi1_ipoib_sdma_wakeup - ipoib sdma wakeup function
+ *
+ * This function gets called when SDMA descriptors become available and the Tx
+ * queue's wait structure was previously added to the sdma engine's dmawait
+ * list.
+ */
+static void hfi1_ipoib_sdma_wakeup(struct iowait *wait, int reason)
+{
+	struct hfi1_ipoib_txq *txq =
+		container_of(wait, struct hfi1_ipoib_txq, wait);
+
+	if (likely(txq->priv->netdev->reg_state == NETREG_REGISTERED))
+		iowait_schedule(wait, system_highpri_wq, WORK_CPU_UNBOUND);
+}
+
+static void hfi1_ipoib_flush_txq(struct work_struct *work)
+{
+	struct iowait_work *ioww =
+		container_of(work, struct iowait_work, iowork);
+	struct iowait *wait = iowait_ioww_to_iow(ioww);
+	struct hfi1_ipoib_txq *txq =
+		container_of(wait, struct hfi1_ipoib_txq, wait);
+	struct net_device *dev = txq->priv->netdev;
+
+	if (likely(dev->reg_state == NETREG_REGISTERED) &&
+	    likely(!hfi1_ipoib_flush_tx_list(dev, txq)))
+		if (atomic_xchg(&txq->no_desc, 0))
+			hfi1_ipoib_wake_txq(txq);
+}
+
+int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv)
+{
+	struct net_device *dev = priv->netdev;
+	char buf[HFI1_IPOIB_TXREQ_NAME_LEN];
+	unsigned long tx_ring_size;
+	int i;
+
+	/*
+	 * The ring holds 1 less than tx_ring_size entries.
+	 * Round up to the next power of 2 so it holds at least tx_queue_len.
+	 */
+	tx_ring_size = roundup_pow_of_two((unsigned long)dev->tx_queue_len + 1);
+
+	snprintf(buf, sizeof(buf), "hfi1_%u_ipoib_txreq_cache", priv->dd->unit);
+	priv->txreq_cache = kmem_cache_create(buf,
+					      sizeof(struct ipoib_txreq),
+					      0,
+					      0,
+					      NULL);
+	if (!priv->txreq_cache)
+		return -ENOMEM;
+
+	priv->tx_napis = kcalloc_node(dev->num_tx_queues,
+				      sizeof(struct napi_struct),
+				      GFP_KERNEL,
+				      priv->dd->node);
+	if (!priv->tx_napis)
+		goto free_txreq_cache;
+
+	priv->txqs = kcalloc_node(dev->num_tx_queues,
+				  sizeof(struct hfi1_ipoib_txq),
+				  GFP_KERNEL,
+				  priv->dd->node);
+	if (!priv->txqs)
+		goto free_tx_napis;
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		struct hfi1_ipoib_txq *txq = &priv->txqs[i];
+
+		iowait_init(&txq->wait,
+			    0,
+			    hfi1_ipoib_flush_txq,
+			    NULL,
+			    hfi1_ipoib_sdma_sleep,
+			    hfi1_ipoib_sdma_wakeup,
+			    NULL,
+			    NULL);
+		txq->priv = priv;
+		txq->sde = NULL;
+		INIT_LIST_HEAD(&txq->tx_list);
+		atomic64_set(&txq->complete_txreqs, 0);
+		atomic_set(&txq->stops, 0);
+		atomic_set(&txq->ring_full, 0);
+		atomic_set(&txq->no_desc, 0);
+		txq->q_idx = i;
+		txq->flow.tx_queue = 0xff;
+		txq->flow.sc5 = 0xff;
+		txq->pkts_sent = false;
+
+		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
+					     priv->dd->node);
+
+		txq->tx_ring.items =
+			kcalloc_node(tx_ring_size,
+				     sizeof(struct ipoib_txreq *),
+				     GFP_KERNEL, priv->dd->node);
+		if (!txq->tx_ring.items)
+			goto free_txqs;
+
+		spin_lock_init(&txq->tx_ring.producer_lock);
+		spin_lock_init(&txq->tx_ring.consumer_lock);
+		txq->tx_ring.max_items = tx_ring_size;
+
+		txq->napi = &priv->tx_napis[i];
+		netif_tx_napi_add(dev, txq->napi,
+				  hfi1_ipoib_process_tx_ring,
+				  NAPI_POLL_WEIGHT);
+	}
+
+	return 0;
+
+free_txqs:
+	for (i--; i >= 0; i--) {
+		struct hfi1_ipoib_txq *txq = &priv->txqs[i];
+
+		netif_napi_del(txq->napi);
+		kfree(txq->tx_ring.items);
+	}
+
+	kfree(priv->txqs);
+	priv->txqs = NULL;
+
+free_tx_napis:
+	kfree(priv->tx_napis);
+	priv->tx_napis = NULL;
+
+free_txreq_cache:
+	kmem_cache_destroy(priv->txreq_cache);
+	priv->txreq_cache = NULL;
+	return -ENOMEM;
+}
+
+static void hfi1_ipoib_drain_tx_list(struct hfi1_ipoib_txq *txq)
+{
+	struct sdma_txreq *txreq;
+	struct sdma_txreq *txreq_tmp;
+	atomic64_t *complete_txreqs = &txq->complete_txreqs;
+
+	list_for_each_entry_safe(txreq, txreq_tmp, &txq->tx_list, list) {
+		struct ipoib_txreq *tx =
+			container_of(txreq, struct ipoib_txreq, txreq);
+
+		list_del(&txreq->list);
+		sdma_txclean(txq->priv->dd, &tx->txreq);
+		dev_kfree_skb_any(tx->skb);
+		kmem_cache_free(txq->priv->txreq_cache, tx);
+		atomic64_inc(complete_txreqs);
+	}
+
+	if (hfi1_ipoib_used(txq))
+		dd_dev_warn(txq->priv->dd,
+			    "txq %d not empty found %llu requests\n",
+			    txq->q_idx,
+			    hfi1_ipoib_txreqs(txq->sent_txreqs,
+					      atomic64_read(complete_txreqs)));
+}
+
+void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv)
+{
+	int i;
+
+	for (i = 0; i < priv->netdev->num_tx_queues; i++) {
+		struct hfi1_ipoib_txq *txq = &priv->txqs[i];
+
+		iowait_cancel_work(&txq->wait);
+		iowait_sdma_drain(&txq->wait);
+		hfi1_ipoib_drain_tx_list(txq);
+		netif_napi_del(txq->napi);
+		(void)hfi1_ipoib_drain_tx_ring(txq, txq->tx_ring.max_items);
+		kfree(txq->tx_ring.items);
+	}
+
+	kfree(priv->txqs);
+	priv->txqs = NULL;
+
+	kfree(priv->tx_napis);
+	priv->tx_napis = NULL;
+
+	kmem_cache_destroy(priv->txreq_cache);
+	priv->txreq_cache = NULL;
+}
+
+void hfi1_ipoib_napi_tx_enable(struct net_device *dev)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+	int i;
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		struct hfi1_ipoib_txq *txq = &priv->txqs[i];
+
+		napi_enable(txq->napi);
+	}
+}
+
+void hfi1_ipoib_napi_tx_disable(struct net_device *dev)
+{
+	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+	int i;
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		struct hfi1_ipoib_txq *txq = &priv->txqs[i];
+
+		napi_disable(txq->napi);
+		(void)hfi1_ipoib_drain_tx_ring(txq, txq->tx_ring.max_items);
+	}
+}
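
hfi1_ipoib_txreq_init() above sizes each tx_ring as roundup_pow_of_two(tx_queue_len + 1), and the producer/consumer paths use CIRC_SPACE()/CIRC_CNT() with that power-of-two size, so the ring deliberately holds at most (size - 1) entries and head == tail always means empty. The standalone sketch below demonstrates that accounting; the CIRC_* macros are reproduced from <linux/circ_buf.h> and roundup_pow_of_two() is a simplified userspace re-implementation.

#include <stdio.h>

#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))
#define CIRC_ADD(val, add, size)     (((val) + (add)) & ((size) - 1))

static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned long tx_queue_len = 1000;	/* dev->tx_queue_len */
	unsigned long size = roundup_pow_of_two(tx_queue_len + 1);	/* 1024 */
	unsigned long head = 0, tail = 0;

	/* producer: fill until CIRC_SPACE() reports the ring is full */
	while (CIRC_SPACE(head, tail, size))
		head = CIRC_ADD(head, 1, size);

	printf("size=%lu used=%lu space=%lu\n",
	       size, CIRC_CNT(head, tail, size), CIRC_SPACE(head, tail, size));
	/* prints: size=1024 used=1023 space=0 -> the ring holds size - 1 entries */
	return 0;
}
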
diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c
index d8ff063..3222e3a 100644
--- a/drivers/infiniband/hw/hfi1/mad.c
+++ b/drivers/infiniband/hw/hfi1/mad.c
@@ -721,7 +721,7 @@
 			/* Bad mkey not a violation below level 2 */
 			if (ibp->rvp.mkeyprot < 2)
 				break;
-			/* fall through */
+			fallthrough;
 		case IB_MGMT_METHOD_SET:
 		case IB_MGMT_METHOD_TRAP_REPRESS:
 			if (ibp->rvp.mkey_violations != 0xFFFF)
@@ -1272,7 +1272,7 @@
 	case IB_PORT_NOP:
 		if (phys_state == IB_PORTPHYSSTATE_NOP)
 			break;
-		/* FALLTHROUGH */
+		fallthrough;
 	case IB_PORT_DOWN:
 		if (phys_state == IB_PORTPHYSSTATE_NOP) {
 			link_state = HLS_DN_DOWNDEF;
@@ -2300,7 +2300,6 @@
 	 * can be changed from the default values
 	 */
 	case OPA_VLARB_PREEMPT_ELEMENTS:
-		/* FALLTHROUGH */
 	case OPA_VLARB_PREEMPT_MATRIX:
 		smp->status |= IB_SMP_UNSUP_METH_ATTR;
 		break;
@@ -2381,7 +2380,7 @@
 		__be64 port_vl_rcv_bubble;
 		__be64 port_vl_mark_fecn;
 		__be64 port_vl_xmit_discards;
-	} vls[0]; /* real array size defined by # bits set in vl_select_mask */
+	} vls[]; /* real array size defined by # bits set in vl_select_mask */
 };
 
 enum counter_selects {
@@ -2423,7 +2422,7 @@
 	__be16 attr_id;
 	__be16 err_reqlength;	/* 1 bit, 8 res, 7 bit */
 	__be32 attr_mod;
-	u8 data[0];
+	u8 data[];
 };
 
 #define MSK_LLI 0x000000f0
@@ -4170,7 +4169,7 @@
 			return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
 		if (ibp->rvp.port_cap_flags & IB_PORT_SM)
 			return IB_MAD_RESULT_SUCCESS;
-		/* FALLTHROUGH */
+		fallthrough;
 	default:
 		smp->status |= IB_SMP_UNSUP_METH_ATTR;
 		ret = reply((struct ib_mad_hdr *)smp);
@@ -4240,7 +4239,7 @@
 			return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
 		if (ibp->rvp.port_cap_flags & IB_PORT_SM)
 			return IB_MAD_RESULT_SUCCESS;
-		/* FALLTHROUGH */
+		fallthrough;
 	default:
 		smp->status |= IB_SMP_UNSUP_METH_ATTR;
 		ret = reply((struct ib_mad_hdr *)smp);
@@ -4915,16 +4914,11 @@
  */
 int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
 		     const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-		     const struct ib_mad_hdr *in_mad, size_t in_mad_size,
-		     struct ib_mad_hdr *out_mad, size_t *out_mad_size,
-		     u16 *out_mad_pkey_index)
+		     const struct ib_mad *in_mad, struct ib_mad *out_mad,
+		     size_t *out_mad_size, u16 *out_mad_pkey_index)
 {
-	switch (in_mad->base_version) {
+	switch (in_mad->mad_hdr.base_version) {
 	case OPA_MGMT_BASE_VERSION:
-		if (unlikely(in_mad_size != sizeof(struct opa_mad))) {
-			dev_err(ibdev->dev.parent, "invalid in_mad_size\n");
-			return IB_MAD_RESULT_FAILURE;
-		}
 		return hfi1_process_opa_mad(ibdev, mad_flags, port,
 					    in_wc, in_grh,
 					    (struct opa_mad *)in_mad,
@@ -4932,10 +4926,8 @@
 					    out_mad_size,
 					    out_mad_pkey_index);
 	case IB_MGMT_BASE_VERSION:
-		return hfi1_process_ib_mad(ibdev, mad_flags, port,
-					  in_wc, in_grh,
-					  (const struct ib_mad *)in_mad,
-					  (struct ib_mad *)out_mad);
+		return hfi1_process_ib_mad(ibdev, mad_flags, port, in_wc,
+					   in_grh, in_mad, out_mad);
 	default:
 		break;
 	}
diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h
index 2f48e69..889e63d 100644
--- a/drivers/infiniband/hw/hfi1/mad.h
+++ b/drivers/infiniband/hw/hfi1/mad.h
@@ -165,7 +165,7 @@
 		} __packed ntc_2048;
 
 	};
-	u8	class_data[0];
+	u8	class_data[];
 };
 
 #define IB_VLARB_LOWPRI_0_31    1
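
The MAD changes above convert the old one-element placeholders (vls[0], data[0], class_data[0]) into C99 flexible array members. The short sketch below, using a made-up stand-in struct rather than the driver's, shows the property the conversion relies on: the flexible member contributes nothing to sizeof() and the trailing payload is sized at allocation time.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct opa_mad_like {
	uint16_t attr_id;
	uint16_t err_reqlength;
	uint32_t attr_mod;
	uint8_t  data[];	/* flexible array member, was data[0] */
};

int main(void)
{
	size_t payload = 64;
	struct opa_mad_like *mad = malloc(sizeof(*mad) + payload);

	if (!mad)
		return 1;
	printf("sizeof(header) = %zu, allocation = %zu\n",
	       sizeof(*mad), sizeof(*mad) + payload);
	free(mad);
	return 0;
}
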
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
index 14d2a90..d213f65 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright(c) 2020 Cornelis Networks, Inc.
  * Copyright(c) 2016 - 2017 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
@@ -48,23 +49,11 @@
 #include <linux/rculist.h>
 #include <linux/mmu_notifier.h>
 #include <linux/interval_tree_generic.h>
+#include <linux/sched/mm.h>
 
 #include "mmu_rb.h"
 #include "trace.h"
 
-struct mmu_rb_handler {
-	struct mmu_notifier mn;
-	struct rb_root_cached root;
-	void *ops_arg;
-	spinlock_t lock;        /* protect the RB tree */
-	struct mmu_rb_ops *ops;
-	struct mm_struct *mm;
-	struct list_head lru_list;
-	struct work_struct del_work;
-	struct list_head del_list;
-	struct workqueue_struct *wq;
-};
-
 static unsigned long mmu_node_start(struct mmu_rb_node *);
 static unsigned long mmu_node_last(struct mmu_rb_node *);
 static int mmu_notifier_range_start(struct mmu_notifier *,
@@ -92,37 +81,36 @@
 	return PAGE_ALIGN(node->addr + node->len) - 1;
 }
 
-int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
+int hfi1_mmu_rb_register(void *ops_arg,
 			 struct mmu_rb_ops *ops,
 			 struct workqueue_struct *wq,
 			 struct mmu_rb_handler **handler)
 {
-	struct mmu_rb_handler *handlr;
+	struct mmu_rb_handler *h;
 	int ret;
 
-	handlr = kmalloc(sizeof(*handlr), GFP_KERNEL);
-	if (!handlr)
+	h = kzalloc(sizeof(*h), GFP_KERNEL);
+	if (!h)
 		return -ENOMEM;
 
-	handlr->root = RB_ROOT_CACHED;
-	handlr->ops = ops;
-	handlr->ops_arg = ops_arg;
-	INIT_HLIST_NODE(&handlr->mn.hlist);
-	spin_lock_init(&handlr->lock);
-	handlr->mn.ops = &mn_opts;
-	handlr->mm = mm;
-	INIT_WORK(&handlr->del_work, handle_remove);
-	INIT_LIST_HEAD(&handlr->del_list);
-	INIT_LIST_HEAD(&handlr->lru_list);
-	handlr->wq = wq;
+	h->root = RB_ROOT_CACHED;
+	h->ops = ops;
+	h->ops_arg = ops_arg;
+	INIT_HLIST_NODE(&h->mn.hlist);
+	spin_lock_init(&h->lock);
+	h->mn.ops = &mn_opts;
+	INIT_WORK(&h->del_work, handle_remove);
+	INIT_LIST_HEAD(&h->del_list);
+	INIT_LIST_HEAD(&h->lru_list);
+	h->wq = wq;
 
-	ret = mmu_notifier_register(&handlr->mn, handlr->mm);
+	ret = mmu_notifier_register(&h->mn, current->mm);
 	if (ret) {
-		kfree(handlr);
+		kfree(h);
 		return ret;
 	}
 
-	*handler = handlr;
+	*handler = h;
 	return 0;
 }
 
@@ -134,7 +122,7 @@
 	struct list_head del_list;
 
 	/* Unregister first so we don't get any more notifications. */
-	mmu_notifier_unregister(&handler->mn, handler->mm);
+	mmu_notifier_unregister(&handler->mn, handler->mn.mm);
 
 	/*
 	 * Make sure the wq delete handler is finished running.  It will not
@@ -166,6 +154,10 @@
 	int ret = 0;
 
 	trace_hfi1_mmu_rb_insert(mnode->addr, mnode->len);
+
+	if (current->mm != handler->mn.mm)
+		return -EPERM;
+
 	spin_lock_irqsave(&handler->lock, flags);
 	node = __mmu_rb_search(handler, mnode->addr, mnode->len);
 	if (node) {
@@ -180,6 +172,7 @@
 		__mmu_int_rb_remove(mnode, &handler->root);
 		list_del(&mnode->list); /* remove from LRU list */
 	}
+	mnode->handler = handler;
 unlock:
 	spin_unlock_irqrestore(&handler->lock, flags);
 	return ret;
@@ -217,6 +210,9 @@
 	unsigned long flags;
 	bool ret = false;
 
+	if (current->mm != handler->mn.mm)
+		return ret;
+
 	spin_lock_irqsave(&handler->lock, flags);
 	node = __mmu_rb_search(handler, addr, len);
 	if (node) {
@@ -239,6 +235,9 @@
 	unsigned long flags;
 	bool stop = false;
 
+	if (current->mm != handler->mn.mm)
+		return;
+
 	INIT_LIST_HEAD(&del_list);
 
 	spin_lock_irqsave(&handler->lock, flags);
@@ -272,6 +271,9 @@
 {
 	unsigned long flags;
 
+	if (current->mm != handler->mn.mm)
+		return;
+
 	/* Validity of handler and node pointers has been checked by caller. */
 	trace_hfi1_mmu_rb_remove(node->addr, node->len);
 	spin_lock_irqsave(&handler->lock, flags);
@@ -333,7 +335,7 @@
 
 /*
  * Work queue function to remove all nodes that have been queued up to
- * be removed.  The key feature is that mm->mmap_sem is not being held
+ * be removed.  The key feature is that mm->mmap_lock is not being held
  * and the remove callback can sleep while taking it, if needed.
  */
 static void handle_remove(struct work_struct *work)
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h
index f04cec1..423aacc 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.h
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.h
@@ -1,4 +1,5 @@
 /*
+ * Copyright(c) 2020 Cornelis Networks, Inc.
  * Copyright(c) 2016 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
@@ -54,6 +55,7 @@
 	unsigned long len;
 	unsigned long __last;
 	struct rb_node node;
+	struct mmu_rb_handler *handler;
 	struct list_head list;
 };
 
@@ -71,7 +73,19 @@
 		     void *evict_arg, bool *stop);
 };
 
-int hfi1_mmu_rb_register(void *ops_arg, struct mm_struct *mm,
+struct mmu_rb_handler {
+	struct mmu_notifier mn;
+	struct rb_root_cached root;
+	void *ops_arg;
+	spinlock_t lock;        /* protect the RB tree */
+	struct mmu_rb_ops *ops;
+	struct list_head lru_list;
+	struct work_struct del_work;
+	struct list_head del_list;
+	struct workqueue_struct *wq;
+};
+
+int hfi1_mmu_rb_register(void *ops_arg,
 			 struct mmu_rb_ops *ops,
 			 struct workqueue_struct *wq,
 			 struct mmu_rb_handler **handler);
diff --git a/drivers/infiniband/hw/hfi1/msix.c b/drivers/infiniband/hw/hfi1/msix.c
index d920b16..d61ee85 100644
--- a/drivers/infiniband/hw/hfi1/msix.c
+++ b/drivers/infiniband/hw/hfi1/msix.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
 /*
- * Copyright(c) 2018 Intel Corporation.
+ * Copyright(c) 2018 - 2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -49,6 +49,7 @@
 #include "hfi.h"
 #include "affinity.h"
 #include "sdma.h"
+#include "netdev.h"
 
 /**
  * msix_initialize() - Calculate, request and configure MSIx IRQs
@@ -69,7 +70,7 @@
 	 *	one for each VNIC context
 	 *      ...any new IRQs should be added here.
 	 */
-	total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts;
+	total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_netdev_contexts;
 
 	if (total >= CCE_NUM_MSIX_VECTORS)
 		return -EINVAL;
@@ -115,13 +116,11 @@
  */
 static int msix_request_irq(struct hfi1_devdata *dd, void *arg,
 			    irq_handler_t handler, irq_handler_t thread,
-			    u32 idx, enum irq_type type)
+			    enum irq_type type, const char *name)
 {
 	unsigned long nr;
 	int irq;
 	int ret;
-	const char *err_info;
-	char name[MAX_NAME_SIZE];
 	struct hfi1_msix_entry *me;
 
 	/* Allocate an MSIx vector */
@@ -135,43 +134,15 @@
 	if (nr == dd->msix_info.max_requested)
 		return -ENOSPC;
 
-	/* Specific verification and determine the name */
-	switch (type) {
-	case IRQ_GENERAL:
-		/* general interrupt must be MSIx vector 0 */
-		if (nr) {
-			spin_lock(&dd->msix_info.msix_lock);
-			__clear_bit(nr, dd->msix_info.in_use_msix);
-			spin_unlock(&dd->msix_info.msix_lock);
-			dd_dev_err(dd, "Invalid index %lu for GENERAL IRQ\n",
-				   nr);
-			return -EINVAL;
-		}
-		snprintf(name, sizeof(name), DRIVER_NAME "_%d", dd->unit);
-		err_info = "general";
-		break;
-	case IRQ_SDMA:
-		snprintf(name, sizeof(name), DRIVER_NAME "_%d sdma%d",
-			 dd->unit, idx);
-		err_info = "sdma";
-		break;
-	case IRQ_RCVCTXT:
-		snprintf(name, sizeof(name), DRIVER_NAME "_%d kctxt%d",
-			 dd->unit, idx);
-		err_info = "receive context";
-		break;
-	case IRQ_OTHER:
-	default:
+	if (type < IRQ_SDMA || type >= IRQ_OTHER)
 		return -EINVAL;
-	}
-	name[sizeof(name) - 1] = 0;
 
 	irq = pci_irq_vector(dd->pcidev, nr);
 	ret = pci_request_irq(dd->pcidev, nr, handler, thread, arg, name);
 	if (ret) {
 		dd_dev_err(dd,
-			   "%s: request for IRQ %d failed, MSIx %d, err %d\n",
-			   err_info, irq, idx, ret);
+			   "%s: request for IRQ %d failed, MSIx %lx, err %d\n",
+			   name, irq, nr, ret);
 		spin_lock(&dd->msix_info.msix_lock);
 		__clear_bit(nr, dd->msix_info.in_use_msix);
 		spin_unlock(&dd->msix_info.msix_lock);
@@ -190,22 +161,19 @@
 	/* This is a request, so a failure is not fatal */
 	ret = hfi1_get_irq_affinity(dd, me);
 	if (ret)
-		dd_dev_err(dd, "unable to pin IRQ %d\n", ret);
+		dd_dev_err(dd, "%s: unable to pin IRQ %d\n", name, ret);
 
 	return nr;
 }
 
-/**
- * msix_request_rcd_irq() - Helper function for RCVAVAIL IRQs
- * @rcd: valid rcd context
- *
- */
-int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd)
+static int msix_request_rcd_irq_common(struct hfi1_ctxtdata *rcd,
+				       irq_handler_t handler,
+				       irq_handler_t thread,
+				       const char *name)
 {
-	int nr;
-
-	nr = msix_request_irq(rcd->dd, rcd, receive_context_interrupt,
-			      receive_context_thread, rcd->ctxt, IRQ_RCVCTXT);
+	int nr = msix_request_irq(rcd->dd, rcd, handler, thread,
+				  rcd->is_vnic ? IRQ_NETDEVCTXT : IRQ_RCVCTXT,
+				  name);
 	if (nr < 0)
 		return nr;
 
@@ -222,6 +190,37 @@
 }
 
 /**
+ * msix_request_rcd_irq() - Helper function for RCVAVAIL IRQs
+ * @rcd: valid rcd context
+ *
+ */
+int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd)
+{
+	char name[MAX_NAME_SIZE];
+
+	snprintf(name, sizeof(name), DRIVER_NAME "_%d kctxt%d",
+		 rcd->dd->unit, rcd->ctxt);
+
+	return msix_request_rcd_irq_common(rcd, receive_context_interrupt,
+					   receive_context_thread, name);
+}
+
+/**
+ * msix_netdev_request_rcd_irq() - Helper function for RCVAVAIL IRQs
+ * for netdev contexts
+ * @rcd: valid netdev context
+ */
+int msix_netdev_request_rcd_irq(struct hfi1_ctxtdata *rcd)
+{
+	char name[MAX_NAME_SIZE];
+
+	snprintf(name, sizeof(name), DRIVER_NAME "_%d nd kctxt%d",
+		 rcd->dd->unit, rcd->ctxt);
+	return msix_request_rcd_irq_common(rcd, receive_context_interrupt_napi,
+					   NULL, name);
+}
+
+/**
  * msix_request_smda_ira() - Helper for getting SDMA IRQ resources
  * @sde: valid sdma engine
  *
@@ -229,9 +228,12 @@
 int msix_request_sdma_irq(struct sdma_engine *sde)
 {
 	int nr;
+	char name[MAX_NAME_SIZE];
 
+	snprintf(name, sizeof(name), DRIVER_NAME "_%d sdma%d",
+		 sde->dd->unit, sde->this_idx);
 	nr = msix_request_irq(sde->dd, sde, sdma_interrupt, NULL,
-			      sde->this_idx, IRQ_SDMA);
+			      IRQ_SDMA, name);
 	if (nr < 0)
 		return nr;
 	sde->msix_intr = nr;
@@ -241,6 +243,32 @@
 }
 
 /**
+ * msix_request_general_irq() - Helper for getting general IRQ
+ * resources
+ * @dd: valid device data
+ */
+int msix_request_general_irq(struct hfi1_devdata *dd)
+{
+	int nr;
+	char name[MAX_NAME_SIZE];
+
+	snprintf(name, sizeof(name), DRIVER_NAME "_%d", dd->unit);
+	nr = msix_request_irq(dd, dd, general_interrupt, NULL, IRQ_GENERAL,
+			      name);
+	if (nr < 0)
+		return nr;
+
+	/* general interrupt must be MSIx vector 0 */
+	if (nr) {
+		msix_free_irq(dd, (u8)nr);
+		dd_dev_err(dd, "Invalid index %d for GENERAL IRQ\n", nr);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
  * enable_sdma_src() - Helper to enable SDMA IRQ srcs
  * @dd: valid devdata structure
  * @i: index of SDMA engine
@@ -265,10 +293,9 @@
 int msix_request_irqs(struct hfi1_devdata *dd)
 {
 	int i;
-	int ret;
+	int ret = msix_request_general_irq(dd);
 
-	ret = msix_request_irq(dd, dd, general_interrupt, NULL, 0, IRQ_GENERAL);
-	if (ret < 0)
+	if (ret)
 		return ret;
 
 	for (i = 0; i < dd->num_sdma; i++) {
@@ -345,15 +372,16 @@
 }
 
 /**
- * msix_vnic_syncrhonize_irq() - Vnic IRQ synchronize
+ * msix_netdev_synchronize_irq() - netdev IRQ synchronize
  * @dd: valid devdata
  */
-void msix_vnic_synchronize_irq(struct hfi1_devdata *dd)
+void msix_netdev_synchronize_irq(struct hfi1_devdata *dd)
 {
 	int i;
+	int ctxt_count = hfi1_netdev_ctxt_count(dd);
 
-	for (i = 0; i < dd->vnic.num_ctxt; i++) {
-		struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i];
+	for (i = 0; i < ctxt_count; i++) {
+		struct hfi1_ctxtdata *rcd = hfi1_netdev_get_ctxt(dd, i);
 		struct hfi1_msix_entry *me;
 
 		me = &dd->msix_info.msix_entries[rcd->msix_intr];
diff --git a/drivers/infiniband/hw/hfi1/msix.h b/drivers/infiniband/hw/hfi1/msix.h
index a514881..e63e944 100644
--- a/drivers/infiniband/hw/hfi1/msix.h
+++ b/drivers/infiniband/hw/hfi1/msix.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
 /*
- * Copyright(c) 2018 Intel Corporation.
+ * Copyright(c) 2018 - 2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -54,11 +54,13 @@
 int msix_initialize(struct hfi1_devdata *dd);
 int msix_request_irqs(struct hfi1_devdata *dd);
 void msix_clean_up_interrupts(struct hfi1_devdata *dd);
+int msix_request_general_irq(struct hfi1_devdata *dd);
 int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd);
 int msix_request_sdma_irq(struct sdma_engine *sde);
 void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr);
 
-/* VNIC interface */
-void msix_vnic_synchronize_irq(struct hfi1_devdata *dd);
+/* Netdev interface */
+void msix_netdev_synchronize_irq(struct hfi1_devdata *dd);
+int msix_netdev_request_rcd_irq(struct hfi1_ctxtdata *rcd);
 
 #endif
diff --git a/drivers/infiniband/hw/hfi1/netdev.h b/drivers/infiniband/hw/hfi1/netdev.h
new file mode 100644
index 0000000..947543a
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/netdev.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/*
+ * Copyright(c) 2020 Intel Corporation.
+ *
+ */
+
+#ifndef HFI1_NETDEV_H
+#define HFI1_NETDEV_H
+
+#include "hfi.h"
+
+#include <linux/netdevice.h>
+#include <linux/xarray.h>
+
+/**
+ * struct hfi1_netdev_rxq - Receive queue for the HFI dummy netdev.
+ * Both IPoIB and VNIC netdevices work on top of this device.
+ * @napi: napi object
+ * @priv: ptr to netdev_priv
+ * @rcd:  ptr to receive context data
+ */
+struct hfi1_netdev_rxq {
+	struct napi_struct napi;
+	struct hfi1_netdev_priv *priv;
+	struct hfi1_ctxtdata *rcd;
+};
+
+/*
+ * Number of netdev contexts used. Ensure it is less than or equal to
+ * max queues supported by VNIC (HFI1_VNIC_MAX_QUEUE).
+ */
+#define HFI1_MAX_NETDEV_CTXTS   8
+
+/* Number of NETDEV RSM entries */
+#define NUM_NETDEV_MAP_ENTRIES HFI1_MAX_NETDEV_CTXTS
+
+/**
+ * struct hfi1_netdev_priv: data required to setup and run HFI netdev.
+ * @dd:		hfi1_devdata
+ * @rxq:	pointer to dummy netdev receive queues.
+ * @num_rx_q:	number of receive queues
+ * @rmt_start:	first free index in the RMT array
+ * @dev_tbl:	netdev table of unique identifiers for VNIC and IPoIB VLANs.
+ * @enabled:	atomic counter of netdevs enabling receive queues.
+ *		When 0 NAPI will be disabled.
+ * @netdevs:	atomic counter of netdevs using dummy netdev.
+ *		When 0 receive queues will be freed.
+ */
+struct hfi1_netdev_priv {
+	struct hfi1_devdata *dd;
+	struct hfi1_netdev_rxq *rxq;
+	int num_rx_q;
+	int rmt_start;
+	struct xarray dev_tbl;
+	/* count of enabled napi polls */
+	atomic_t enabled;
+	/* count of netdevs on top */
+	atomic_t netdevs;
+};
+
+static inline
+struct hfi1_netdev_priv *hfi1_netdev_priv(struct net_device *dev)
+{
+	return (struct hfi1_netdev_priv *)&dev[1];
+}
+
+static inline
+int hfi1_netdev_ctxt_count(struct hfi1_devdata *dd)
+{
+	struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+
+	return priv->num_rx_q;
+}
+
+static inline
+struct hfi1_ctxtdata *hfi1_netdev_get_ctxt(struct hfi1_devdata *dd, int ctxt)
+{
+	struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+
+	return priv->rxq[ctxt].rcd;
+}
+
+static inline
+int hfi1_netdev_get_free_rmt_idx(struct hfi1_devdata *dd)
+{
+	struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+
+	return priv->rmt_start;
+}
+
+static inline
+void hfi1_netdev_set_free_rmt_idx(struct hfi1_devdata *dd, int rmt_idx)
+{
+	struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+
+	priv->rmt_start = rmt_idx;
+}
+
+u32 hfi1_num_netdev_contexts(struct hfi1_devdata *dd, u32 available_contexts,
+			     struct cpumask *cpu_mask);
+
+void hfi1_netdev_enable_queues(struct hfi1_devdata *dd);
+void hfi1_netdev_disable_queues(struct hfi1_devdata *dd);
+int hfi1_netdev_rx_init(struct hfi1_devdata *dd);
+int hfi1_netdev_rx_destroy(struct hfi1_devdata *dd);
+int hfi1_netdev_alloc(struct hfi1_devdata *dd);
+void hfi1_netdev_free(struct hfi1_devdata *dd);
+int hfi1_netdev_add_data(struct hfi1_devdata *dd, int id, void *data);
+void *hfi1_netdev_remove_data(struct hfi1_devdata *dd, int id);
+void *hfi1_netdev_get_data(struct hfi1_devdata *dd, int id);
+void *hfi1_netdev_get_first_data(struct hfi1_devdata *dd, int *start_id);
+
+/* chip.c  */
+int hfi1_netdev_rx_napi(struct napi_struct *napi, int budget);
+
+#endif /* HFI1_NETDEV_H */
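
hfi1_netdev_priv() above returns &dev[1], i.e. the private state is expected to sit immediately after the dummy net_device in a single allocation, which is exactly how hfi1_netdev_alloc() sizes it later in this patch. The userspace sketch below models that layout with trivial stand-in structs; it illustrates the pointer arithmetic only and does not reuse the kernel definitions.

#include <stdio.h>
#include <stdlib.h>

struct fake_net_device {
	char name[16];
};

struct fake_priv {
	int num_rx_q;
};

static struct fake_priv *netdev_priv_of(struct fake_net_device *dev)
{
	/* same pointer arithmetic as hfi1_netdev_priv(): &dev[1] */
	return (struct fake_priv *)&dev[1];
}

int main(void)
{
	/* one allocation: device header followed by the private struct */
	struct fake_net_device *dev =
		calloc(1, sizeof(*dev) + sizeof(struct fake_priv));
	struct fake_priv *priv;

	if (!dev)
		return 1;
	priv = netdev_priv_of(dev);
	priv->num_rx_q = 8;
	printf("dev=%p priv=%p num_rx_q=%d\n",
	       (void *)dev, (void *)priv, priv->num_rx_q);
	free(dev);
	return 0;
}
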
diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c
new file mode 100644
index 0000000..ea95baa
--- /dev/null
+++ b/drivers/infiniband/hw/hfi1/netdev_rx.c
@@ -0,0 +1,480 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/*
+ * Copyright(c) 2020 Intel Corporation.
+ *
+ */
+
+/*
+ * This file contains HFI1 support for netdev RX functionality
+ */
+
+#include "sdma.h"
+#include "verbs.h"
+#include "netdev.h"
+#include "hfi.h"
+
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <rdma/ib_verbs.h>
+
+static int hfi1_netdev_setup_ctxt(struct hfi1_netdev_priv *priv,
+				  struct hfi1_ctxtdata *uctxt)
+{
+	unsigned int rcvctrl_ops;
+	struct hfi1_devdata *dd = priv->dd;
+	int ret;
+
+	uctxt->rhf_rcv_function_map = netdev_rhf_rcv_functions;
+	uctxt->do_interrupt = &handle_receive_interrupt_napi_sp;
+
+	/* Now allocate the RcvHdr queue and eager buffers. */
+	ret = hfi1_create_rcvhdrq(dd, uctxt);
+	if (ret)
+		goto done;
+
+	ret = hfi1_setup_eagerbufs(uctxt);
+	if (ret)
+		goto done;
+
+	clear_rcvhdrtail(uctxt);
+
+	rcvctrl_ops = HFI1_RCVCTRL_CTXT_DIS;
+	rcvctrl_ops |= HFI1_RCVCTRL_INTRAVAIL_DIS;
+
+	if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
+		rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
+	if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
+		rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
+	if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
+		rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
+	if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
+		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
+
+	hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt);
+done:
+	return ret;
+}
+
+static int hfi1_netdev_allocate_ctxt(struct hfi1_devdata *dd,
+				     struct hfi1_ctxtdata **ctxt)
+{
+	struct hfi1_ctxtdata *uctxt;
+	int ret;
+
+	if (dd->flags & HFI1_FROZEN)
+		return -EIO;
+
+	ret = hfi1_create_ctxtdata(dd->pport, dd->node, &uctxt);
+	if (ret < 0) {
+		dd_dev_err(dd, "Unable to create ctxtdata, failing open\n");
+		return -ENOMEM;
+	}
+
+	uctxt->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
+		HFI1_CAP_KGET(NODROP_RHQ_FULL) |
+		HFI1_CAP_KGET(NODROP_EGR_FULL) |
+		HFI1_CAP_KGET(DMA_RTAIL);
+	/* Netdev contexts are always NO_RDMA_RTAIL */
+	uctxt->fast_handler = handle_receive_interrupt_napi_fp;
+	uctxt->slow_handler = handle_receive_interrupt_napi_sp;
+	hfi1_set_seq_cnt(uctxt, 1);
+	uctxt->is_vnic = true;
+
+	hfi1_stats.sps_ctxts++;
+
+	dd_dev_info(dd, "created netdev context %d\n", uctxt->ctxt);
+	*ctxt = uctxt;
+
+	return 0;
+}
+
+static void hfi1_netdev_deallocate_ctxt(struct hfi1_devdata *dd,
+					struct hfi1_ctxtdata *uctxt)
+{
+	flush_wc();
+
+	/*
+	 * Disable receive context and interrupt available, reset all
+	 * RcvCtxtCtrl bits to default values.
+	 */
+	hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
+		     HFI1_RCVCTRL_TIDFLOW_DIS |
+		     HFI1_RCVCTRL_INTRAVAIL_DIS |
+		     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
+		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
+		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt);
+
+	if (uctxt->msix_intr != CCE_NUM_MSIX_VECTORS)
+		msix_free_irq(dd, uctxt->msix_intr);
+
+	uctxt->msix_intr = CCE_NUM_MSIX_VECTORS;
+	uctxt->event_flags = 0;
+
+	hfi1_clear_tids(uctxt);
+	hfi1_clear_ctxt_pkey(dd, uctxt);
+
+	hfi1_stats.sps_ctxts--;
+
+	hfi1_free_ctxt(uctxt);
+}
+
+static int hfi1_netdev_allot_ctxt(struct hfi1_netdev_priv *priv,
+				  struct hfi1_ctxtdata **ctxt)
+{
+	int rc;
+	struct hfi1_devdata *dd = priv->dd;
+
+	rc = hfi1_netdev_allocate_ctxt(dd, ctxt);
+	if (rc) {
+		dd_dev_err(dd, "netdev ctxt alloc failed %d\n", rc);
+		return rc;
+	}
+
+	rc = hfi1_netdev_setup_ctxt(priv, *ctxt);
+	if (rc) {
+		dd_dev_err(dd, "netdev ctxt setup failed %d\n", rc);
+		hfi1_netdev_deallocate_ctxt(dd, *ctxt);
+		*ctxt = NULL;
+	}
+
+	return rc;
+}
+
+/**
+ * hfi1_num_netdev_contexts - Count of netdev recv contexts to use.
+ * @dd: device on which to allocate netdev contexts
+ * @available_contexts: count of available receive contexts
+ * @cpu_mask: mask of possible cpus to include for contexts
+ *
+ * Return: count of physical cores on a node or the remaining available recv
+ * contexts for netdev recv context usage up to the maximum of
+ * HFI1_MAX_NETDEV_CTXTS.
+ * A value of 0 can be returned when acceleration is explicitly turned off,
+ * a memory allocation error occurs or when there are no available contexts.
+ *
+ */
+u32 hfi1_num_netdev_contexts(struct hfi1_devdata *dd, u32 available_contexts,
+			     struct cpumask *cpu_mask)
+{
+	cpumask_var_t node_cpu_mask;
+	unsigned int available_cpus;
+
+	if (!HFI1_CAP_IS_KSET(AIP))
+		return 0;
+
+	/* Always give user contexts priority over netdev contexts */
+	if (available_contexts == 0) {
+		dd_dev_info(dd, "No receive contexts available for netdevs.\n");
+		return 0;
+	}
+
+	if (!zalloc_cpumask_var(&node_cpu_mask, GFP_KERNEL)) {
+		dd_dev_err(dd, "Unable to allocate cpu_mask for netdevs.\n");
+		return 0;
+	}
+
+	cpumask_and(node_cpu_mask, cpu_mask, cpumask_of_node(dd->node));
+
+	available_cpus = cpumask_weight(node_cpu_mask);
+
+	free_cpumask_var(node_cpu_mask);
+
+	return min3(available_cpus, available_contexts,
+		    (u32)HFI1_MAX_NETDEV_CTXTS);
+}
+
+static int hfi1_netdev_rxq_init(struct net_device *dev)
+{
+	int i;
+	int rc;
+	struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dev);
+	struct hfi1_devdata *dd = priv->dd;
+
+	priv->num_rx_q = dd->num_netdev_contexts;
+	priv->rxq = kcalloc_node(priv->num_rx_q, sizeof(struct hfi1_netdev_rxq),
+				 GFP_KERNEL, dd->node);
+
+	if (!priv->rxq) {
+		dd_dev_err(dd, "Unable to allocate netdev queue data\n");
+		return (-ENOMEM);
+	}
+
+	for (i = 0; i < priv->num_rx_q; i++) {
+		struct hfi1_netdev_rxq *rxq = &priv->rxq[i];
+
+		rc = hfi1_netdev_allot_ctxt(priv, &rxq->rcd);
+		if (rc)
+			goto bail_context_irq_failure;
+
+		hfi1_rcd_get(rxq->rcd);
+		rxq->priv = priv;
+		rxq->rcd->napi = &rxq->napi;
+		dd_dev_info(dd, "Setting rcv queue %d napi to context %d\n",
+			    i, rxq->rcd->ctxt);
+		/*
+		 * Disable BUSY_POLL on this NAPI as this is not supported
+		 * right now.
+		 */
+		set_bit(NAPI_STATE_NO_BUSY_POLL, &rxq->napi.state);
+		netif_napi_add(dev, &rxq->napi, hfi1_netdev_rx_napi, 64);
+		rc = msix_netdev_request_rcd_irq(rxq->rcd);
+		if (rc)
+			goto bail_context_irq_failure;
+	}
+
+	return 0;
+
+bail_context_irq_failure:
+	dd_dev_err(dd, "Unable to allot receive context\n");
+	for (; i >= 0; i--) {
+		struct hfi1_netdev_rxq *rxq = &priv->rxq[i];
+
+		if (rxq->rcd) {
+			hfi1_netdev_deallocate_ctxt(dd, rxq->rcd);
+			hfi1_rcd_put(rxq->rcd);
+			rxq->rcd = NULL;
+		}
+	}
+	kfree(priv->rxq);
+	priv->rxq = NULL;
+
+	return rc;
+}
+
+static void hfi1_netdev_rxq_deinit(struct net_device *dev)
+{
+	int i;
+	struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dev);
+	struct hfi1_devdata *dd = priv->dd;
+
+	for (i = 0; i < priv->num_rx_q; i++) {
+		struct hfi1_netdev_rxq *rxq = &priv->rxq[i];
+
+		netif_napi_del(&rxq->napi);
+		hfi1_netdev_deallocate_ctxt(dd, rxq->rcd);
+		hfi1_rcd_put(rxq->rcd);
+		rxq->rcd = NULL;
+	}
+
+	kfree(priv->rxq);
+	priv->rxq = NULL;
+	priv->num_rx_q = 0;
+}
+
+static void enable_queues(struct hfi1_netdev_priv *priv)
+{
+	int i;
+
+	for (i = 0; i < priv->num_rx_q; i++) {
+		struct hfi1_netdev_rxq *rxq = &priv->rxq[i];
+
+		dd_dev_info(priv->dd, "enabling queue %d on context %d\n", i,
+			    rxq->rcd->ctxt);
+		napi_enable(&rxq->napi);
+		hfi1_rcvctrl(priv->dd,
+			     HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB,
+			     rxq->rcd);
+	}
+}
+
+static void disable_queues(struct hfi1_netdev_priv *priv)
+{
+	int i;
+
+	msix_netdev_synchronize_irq(priv->dd);
+
+	for (i = 0; i < priv->num_rx_q; i++) {
+		struct hfi1_netdev_rxq *rxq = &priv->rxq[i];
+
+		dd_dev_info(priv->dd, "disabling queue %d on context %d\n", i,
+			    rxq->rcd->ctxt);
+
+		/* wait for napi if it was scheduled */
+		hfi1_rcvctrl(priv->dd,
+			     HFI1_RCVCTRL_CTXT_DIS | HFI1_RCVCTRL_INTRAVAIL_DIS,
+			     rxq->rcd);
+		napi_synchronize(&rxq->napi);
+		napi_disable(&rxq->napi);
+	}
+}
+
+/**
+ * hfi1_netdev_rx_init - Increments the netdevs counter. When called for the
+ * first time, it allocates the receive queue data and calls netif_napi_add
+ * for each queue.
+ *
+ * @dd: hfi1 dev data
+ */
+int hfi1_netdev_rx_init(struct hfi1_devdata *dd)
+{
+	struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+	int res;
+
+	if (atomic_fetch_inc(&priv->netdevs))
+		return 0;
+
+	mutex_lock(&hfi1_mutex);
+	init_dummy_netdev(dd->dummy_netdev);
+	res = hfi1_netdev_rxq_init(dd->dummy_netdev);
+	mutex_unlock(&hfi1_mutex);
+	return res;
+}
+
+/**
+ * hfi1_netdev_rx_destroy - Decrements the netdevs counter; when it reaches 0,
+ * napi is deleted and the receive queue memory is freed.
+ *
+ * @dd: hfi1 dev data
+ */
+int hfi1_netdev_rx_destroy(struct hfi1_devdata *dd)
+{
+	struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+
+	/* destroy the RX queues only if it is the last netdev going away */
+	if (atomic_fetch_add_unless(&priv->netdevs, -1, 0) == 1) {
+		mutex_lock(&hfi1_mutex);
+		hfi1_netdev_rxq_deinit(dd->dummy_netdev);
+		mutex_unlock(&hfi1_mutex);
+	}
+
+	return 0;
+}
+
+/**
+ * hfi1_netdev_alloc - Allocates the netdev and private data. It is required
+ * because the RMT index and MSI-X interrupt can only be set
+ * during driver initialization.
+ *
+ * @dd: hfi1 dev data
+ */
+int hfi1_netdev_alloc(struct hfi1_devdata *dd)
+{
+	struct hfi1_netdev_priv *priv;
+	const int netdev_size = sizeof(*dd->dummy_netdev) +
+		sizeof(struct hfi1_netdev_priv);
+
+	dd_dev_info(dd, "allocating netdev size %d\n", netdev_size);
+	dd->dummy_netdev = kcalloc_node(1, netdev_size, GFP_KERNEL, dd->node);
+
+	if (!dd->dummy_netdev)
+		return -ENOMEM;
+
+	priv = hfi1_netdev_priv(dd->dummy_netdev);
+	priv->dd = dd;
+	xa_init(&priv->dev_tbl);
+	atomic_set(&priv->enabled, 0);
+	atomic_set(&priv->netdevs, 0);
+
+	return 0;
+}
+
+void hfi1_netdev_free(struct hfi1_devdata *dd)
+{
+	if (dd->dummy_netdev) {
+		dd_dev_info(dd, "hfi1 netdev freed\n");
+		kfree(dd->dummy_netdev);
+		dd->dummy_netdev = NULL;
+	}
+}
+
+/**
+ * hfi1_netdev_enable_queues - This is the napi enable function.
+ * It enables the napi objects associated with the receive queues.
+ * Each caller increments an atomic counter; the disable function
+ * decrements it and, when it reaches 0, calls napi_disable
+ * for every queue.
+ *
+ * @dd: hfi1 dev data
+ */
+void hfi1_netdev_enable_queues(struct hfi1_devdata *dd)
+{
+	struct hfi1_netdev_priv *priv;
+
+	if (!dd->dummy_netdev)
+		return;
+
+	priv = hfi1_netdev_priv(dd->dummy_netdev);
+	if (atomic_fetch_inc(&priv->enabled))
+		return;
+
+	mutex_lock(&hfi1_mutex);
+	enable_queues(priv);
+	mutex_unlock(&hfi1_mutex);
+}
+
+void hfi1_netdev_disable_queues(struct hfi1_devdata *dd)
+{
+	struct hfi1_netdev_priv *priv;
+
+	if (!dd->dummy_netdev)
+		return;
+
+	priv = hfi1_netdev_priv(dd->dummy_netdev);
+	if (atomic_dec_if_positive(&priv->enabled))
+		return;
+
+	mutex_lock(&hfi1_mutex);
+	disable_queues(priv);
+	mutex_unlock(&hfi1_mutex);
+}
+
+/**
+ * hfi1_netdev_add_data - Registers data under a unique identifier so it can
+ * be requested later; this is needed by the VNIC and IPoIB VLAN
+ * implementations.
+ * The dev_tbl xarray provides the required locking.
+ *
+ * @dd: hfi1 dev data
+ * @id: requested integer id up to INT_MAX
+ * @data: data to be associated with index
+ */
+int hfi1_netdev_add_data(struct hfi1_devdata *dd, int id, void *data)
+{
+	struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+
+	return xa_insert(&priv->dev_tbl, id, data, GFP_NOWAIT);
+}
+
+/**
+ * hfi1_netdev_remove_data - Removes the data registered with the given id.
+ * Returns a pointer to the removed entry.
+ *
+ * @dd: hfi1 dev data
+ * @id: requested integer id up to INT_MAX
+ */
+void *hfi1_netdev_remove_data(struct hfi1_devdata *dd, int id)
+{
+	struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+
+	return xa_erase(&priv->dev_tbl, id);
+}
+
+/**
+ * hfi1_netdev_get_data - Gets data with given id
+ *
+ * @dd: hfi1 dev data
+ * @id: requested integer id up to INT_MAX
+ */
+void *hfi1_netdev_get_data(struct hfi1_devdata *dd, int id)
+{
+	struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+
+	return xa_load(&priv->dev_tbl, id);
+}
+
+/**
+ * hfi1_netdev_get_first_data - Gets the first entry with an id greater than
+ * or equal to *start_id.
+ *
+ * @dd: hfi1 dev data
+ * @start_id: requested integer id up to INT_MAX; updated to the id of the
+ * returned entry
+ */
+void *hfi1_netdev_get_first_data(struct hfi1_devdata *dd, int *start_id)
+{
+	struct hfi1_netdev_priv *priv = hfi1_netdev_priv(dd->dummy_netdev);
+	unsigned long index = *start_id;
+	void *ret;
+
+	ret = xa_find(&priv->dev_tbl, &index, UINT_MAX, XA_PRESENT);
+	*start_id = (int)index;
+	return ret;
+}
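
The helpers above form a small id-keyed lookup table backed by an xarray; the VNIC code later in this patch keys into it with VNIC_ID(). A hedged usage sketch follows; the id value and the client pointer are illustrative, not taken from the patch.

/* Illustrative only: register, look up, and release a client's private
 * data against the id table implemented above.
 */
static int example_register_client(struct hfi1_devdata *dd, void *client)
{
	int id = 42;	/* hypothetical identifier, must fit in an int */
	int ret;

	ret = hfi1_netdev_add_data(dd, id, client);	/* xa_insert() underneath */
	if (ret)
		return ret;	/* e.g. -EBUSY if the id is already taken */

	WARN_ON(hfi1_netdev_get_data(dd, id) != client);

	/* ... use the entry ... */

	hfi1_netdev_remove_data(dd, id);	/* returns the stored pointer */
	return 0;
}
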
diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c
index 61362bd..18d32f0 100644
--- a/drivers/infiniband/hw/hfi1/pcie.c
+++ b/drivers/infiniband/hw/hfi1/pcie.c
@@ -161,7 +161,7 @@
 		return -EINVAL;
 	}
 
-	dd->kregbase1 = ioremap_nocache(addr, RCV_ARRAY);
+	dd->kregbase1 = ioremap(addr, RCV_ARRAY);
 	if (!dd->kregbase1) {
 		dd_dev_err(dd, "UC mapping of kregbase1 failed\n");
 		return -ENOMEM;
@@ -179,7 +179,7 @@
 	dd_dev_info(dd, "RcvArray count: %u\n", rcv_array_count);
 	dd->base2_start  = RCV_ARRAY + rcv_array_count * 8;
 
-	dd->kregbase2 = ioremap_nocache(
+	dd->kregbase2 = ioremap(
 		addr + dd->base2_start,
 		TXE_PIO_SEND - dd->base2_start);
 	if (!dd->kregbase2) {
@@ -306,7 +306,7 @@
 	ret = pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap);
 	if (ret) {
 		dd_dev_err(dd, "Unable to read from PCI config\n");
-		return ret;
+		return pcibios_err_to_errno(ret);
 	}
 
 	if ((linkcap & PCI_EXP_LNKCAP_SLS) != PCI_EXP_LNKCAP_SLS_8_0GB) {
@@ -334,10 +334,14 @@
 	return 0;
 }
 
-/* restore command and BARs after a reset has wiped them out */
+/**
+ * Restore command and BARs after a reset has wiped them out
+ *
+ * Returns 0 on success, otherwise a negative error value
+ */
 int restore_pci_variables(struct hfi1_devdata *dd)
 {
-	int ret = 0;
+	int ret;
 
 	ret = pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command);
 	if (ret)
@@ -386,13 +390,17 @@
 
 error:
 	dd_dev_err(dd, "Unable to write to PCI config\n");
-	return ret;
+	return pcibios_err_to_errno(ret);
 }
 
-/* Save BARs and command to rewrite after device reset */
+/**
+ * Save BARs and command to rewrite after device reset
+ *
+ * Returns 0 on success, otherwise a negative error value
+ */
 int save_pci_variables(struct hfi1_devdata *dd)
 {
-	int ret = 0;
+	int ret;
 
 	ret = pci_read_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0,
 				    &dd->pcibar0);
@@ -441,7 +449,7 @@
 
 error:
 	dd_dev_err(dd, "Unable to read from PCI config\n");
-	return ret;
+	return pcibios_err_to_errno(ret);
 }
 
 /*
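
The pci_*_config_*() accessors return PCIBIOS_* status codes rather than negative errno values, which is why the error paths above now run the result through pcibios_err_to_errno(). A hedged sketch of the same conversion; example_read_vendor() is an illustrative helper, not part of the patch.

/* Sketch only: convert a PCI config-space access result into a normal errno. */
static int example_read_vendor(struct pci_dev *pdev, u16 *vendor)
{
	int ret = pci_read_config_word(pdev, PCI_VENDOR_ID, vendor);

	return ret ? pcibios_err_to_errno(ret) : 0;
}
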
diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c
index 79126b2..1cd8f80 100644
--- a/drivers/infiniband/hw/hfi1/pio.c
+++ b/drivers/infiniband/hw/hfi1/pio.c
@@ -86,7 +86,7 @@
 	switch (op) {
 	case PSC_GLOBAL_ENABLE:
 		reg |= SEND_CTRL_SEND_ENABLE_SMASK;
-	/* Fall through */
+		fallthrough;
 	case PSC_DATA_VL_ENABLE:
 		mask = 0;
 		for (i = 0; i < ARRAY_SIZE(dd->vld); i++)
@@ -920,6 +920,7 @@
 {
 	u64 reg;
 	struct pio_buf *pbuf;
+	LIST_HEAD(wake_list);
 
 	if (!sc)
 		return;
@@ -954,19 +955,21 @@
 	spin_unlock(&sc->release_lock);
 
 	write_seqlock(&sc->waitlock);
-	while (!list_empty(&sc->piowait)) {
+	if (!list_empty(&sc->piowait))
+		list_move(&sc->piowait, &wake_list);
+	write_sequnlock(&sc->waitlock);
+	while (!list_empty(&wake_list)) {
 		struct iowait *wait;
 		struct rvt_qp *qp;
 		struct hfi1_qp_priv *priv;
 
-		wait = list_first_entry(&sc->piowait, struct iowait, list);
+		wait = list_first_entry(&wake_list, struct iowait, list);
 		qp = iowait_to_qp(wait);
 		priv = qp->priv;
 		list_del_init(&priv->s_iowait.list);
 		priv->s_iowait.lock = NULL;
 		hfi1_qp_wakeup(qp, RVT_S_WAIT_PIO | HFI1_S_WAIT_PIO_DRAIN);
 	}
-	write_sequnlock(&sc->waitlock);
 
 	spin_unlock_irq(&sc->alloc_lock);
 }
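
The hunk above narrows the waitlock scope: waiters are moved to a local list while the seqlock is held and are woken only after it is dropped, so the wakeup path cannot recurse into the same lock. A hedged sketch of that splice-then-wake pattern; example_drain_waiters() and its wake callback are hypothetical.

/* Sketch only: detach waiters under the lock, wake them with it dropped. */
static void example_drain_waiters(seqlock_t *lock, struct list_head *waiters,
				  void (*wake)(struct iowait *wait))
{
	LIST_HEAD(wake_list);

	write_seqlock(lock);
	list_splice_init(waiters, &wake_list);	/* same effect as the list_move() above */
	write_sequnlock(lock);

	while (!list_empty(&wake_list)) {
		struct iowait *wait;

		wait = list_first_entry(&wake_list, struct iowait, list);
		list_del_init(&wait->list);
		wake(wait);	/* runs without the seqlock held */
	}
}
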
diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h
index c9a58b6..0102262 100644
--- a/drivers/infiniband/hw/hfi1/pio.h
+++ b/drivers/infiniband/hw/hfi1/pio.h
@@ -243,7 +243,7 @@
  */
 struct pio_map_elem {
 	u32 mask;
-	struct send_context *ksc[0];
+	struct send_context *ksc[];
 };
 
 /*
@@ -263,7 +263,7 @@
 	u32 mask;
 	u8 actual_vls;
 	u8 vls;
-	struct pio_map_elem *map[0];
+	struct pio_map_elem *map[];
 };
 
 int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls,
diff --git a/drivers/infiniband/hw/hfi1/pio_copy.c b/drivers/infiniband/hw/hfi1/pio_copy.c
index 03024ce..4a4ec23 100644
--- a/drivers/infiniband/hw/hfi1/pio_copy.c
+++ b/drivers/infiniband/hw/hfi1/pio_copy.c
@@ -191,25 +191,24 @@
 	switch (n) {
 	case 7:
 		*dest++ = *src++;
-		/* fall through */
+		fallthrough;
 	case 6:
 		*dest++ = *src++;
-		/* fall through */
+		fallthrough;
 	case 5:
 		*dest++ = *src++;
-		/* fall through */
+		fallthrough;
 	case 4:
 		*dest++ = *src++;
-		/* fall through */
+		fallthrough;
 	case 3:
 		*dest++ = *src++;
-		/* fall through */
+		fallthrough;
 	case 2:
 		*dest++ = *src++;
-		/* fall through */
+		fallthrough;
 	case 1:
 		*dest++ = *src++;
-		/* fall through */
 	}
 }
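
The conversions above replace fall-through comments with the fallthrough pseudo-keyword (from <linux/compiler_attributes.h>), which lets -Wimplicit-fallthrough verify that every unterminated case is intentional. A hedged sketch; do_step_one() and do_step_two() are hypothetical helpers.

/* Sketch only: annotate an intentional case fall-through. */
static void do_step_one(void);	/* hypothetical */
static void do_step_two(void);	/* hypothetical */

static void example_fallthrough(int n)
{
	switch (n) {
	case 2:
		do_step_two();
		fallthrough;	/* deliberately continue into case 1 */
	case 1:
		do_step_one();
		break;
	default:
		break;
	}
}
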
 
diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c
index cbf7faa..4642d6c 100644
--- a/drivers/infiniband/hw/hfi1/platform.c
+++ b/drivers/infiniband/hw/hfi1/platform.c
@@ -634,7 +634,7 @@
 			   u32 config_data, const char *message)
 {
 	u8 i;
-	int ret = HCMD_SUCCESS;
+	int ret;
 
 	for (i = 0; i < 4; i++) {
 		ret = load_8051_config(ppd->dd, field_id, i, config_data);
@@ -668,8 +668,8 @@
 
 	/* active optical cables only */
 	switch ((cache[QSFP_MOD_TECH_OFFS] & 0xF0) >> 4) {
-	case 0x0 ... 0x9: /* fallthrough */
-	case 0xC: /* fallthrough */
+	case 0x0 ... 0x9: fallthrough;
+	case 0xC: fallthrough;
 	case 0xE:
 		/* active AOC */
 		power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]);
@@ -899,8 +899,8 @@
 
 		*ptr_tuning_method = OPA_PASSIVE_TUNING;
 		break;
-	case 0x0 ... 0x9: /* fallthrough */
-	case 0xC: /* fallthrough */
+	case 0x0 ... 0x9: fallthrough;
+	case 0xC: fallthrough;
 	case 0xE:
 		ret = tune_active_qsfp(ppd, ptr_tx_preset, ptr_rx_preset,
 				       ptr_total_atten);
@@ -909,7 +909,7 @@
 
 		*ptr_tuning_method = OPA_ACTIVE_TUNING;
 		break;
-	case 0xD: /* fallthrough */
+	case 0xD: fallthrough;
 	case 0xF:
 	default:
 		dd_dev_warn(ppd->dd, "%s: Unknown/unsupported cable\n",
diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index acd4400..356518e 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2019 Intel Corporation.
+ * Copyright(c) 2015 - 2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -186,15 +186,6 @@
 	write_sequnlock_irqrestore(lock, flags);
 }
 
-static inline int opa_mtu_enum_to_int(int mtu)
-{
-	switch (mtu) {
-	case OPA_MTU_8192:  return 8192;
-	case OPA_MTU_10240: return 10240;
-	default:            return -1;
-	}
-}
-
 /**
  * This function is what we would push to the core layer if we wanted to be a
  * "first class citizen".  Instead we hide this here and rely on Verbs ULPs
@@ -202,15 +193,10 @@
  */
 static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu)
 {
-	int val;
-
 	/* Constraining 10KB packets to 8KB packets */
 	if (mtu == (enum ib_mtu)OPA_MTU_10240)
-		mtu = OPA_MTU_8192;
-	val = opa_mtu_enum_to_int((int)mtu);
-	if (val > 0)
-		return val;
-	return ib_mtu_enum_to_int(mtu);
+		mtu = (enum ib_mtu)OPA_MTU_8192;
+	return opa_mtu_enum_to_int((enum opa_mtu)mtu);
 }
 
 int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr,
@@ -326,7 +312,7 @@
 	switch (qp->ibqp.qp_type) {
 	case IB_QPT_RC:
 		hfi1_setup_tid_rdma_wqe(qp, wqe);
-		/* fall through */
+		fallthrough;
 	case IB_QPT_UC:
 		if (wqe->length > 0x80000000U)
 			return -EINVAL;
diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h
index b670321..b0d053d 100644
--- a/drivers/infiniband/hw/hfi1/qp.h
+++ b/drivers/infiniband/hw/hfi1/qp.h
@@ -113,20 +113,6 @@
 }
 
 /**
- * hfi1_create_qp - create a queue pair for a device
- * @ibpd: the protection domain who's device we create the queue pair for
- * @init_attr: the attributes of the queue pair
- * @udata: user data for libibverbs.so
- *
- * Returns the queue pair on success, otherwise returns an errno.
- *
- * Called by the ib_create_qp() core verbs function.
- */
-struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd,
-			     struct ib_qp_init_attr *init_attr,
-			     struct ib_udata *udata);
-
-/**
  * hfi1_qp_wakeup - wake up on the indicated event
  * @qp: the QP
  * @flag: flag the qp on which the qp is stalled
diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c
index b596699..8386c84 100644
--- a/drivers/infiniband/hw/hfi1/qsfp.c
+++ b/drivers/infiniband/hw/hfi1/qsfp.c
@@ -231,7 +231,7 @@
 		break;
 	case 2:
 		offset_bytes[1] = (offset >> 8) & 0xff;
-		/* fall through */
+		fallthrough;
 	case 1:
 		num_msgs = 2;
 		offset_bytes[0] = offset & 0xff;
@@ -279,7 +279,7 @@
 		break;
 	case 2:
 		offset_bytes[1] = (offset >> 8) & 0xff;
-		/* fall through */
+		fallthrough;
 	case 1:
 		num_msgs = 2;
 		offset_bytes[0] = offset & 0xff;
diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c
index 1a3c647..1bb5f57 100644
--- a/drivers/infiniband/hw/hfi1/rc.c
+++ b/drivers/infiniband/hw/hfi1/rc.c
@@ -141,7 +141,7 @@
 	case OP(RDMA_READ_RESPONSE_ONLY):
 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
 		release_rdma_sge_mr(e);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(ATOMIC_ACKNOWLEDGE):
 		/*
 		 * We can increment the tail pointer now that the last
@@ -160,7 +160,7 @@
 			qp->s_acked_ack_queue = next;
 		qp->s_tail_ack_queue = next;
 		trace_hfi1_rsp_make_rc_ack(qp, e->psn);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_ONLY):
 	case OP(ACKNOWLEDGE):
 		/* Check for no next entry in the queue. */
@@ -267,7 +267,7 @@
 
 	case OP(RDMA_READ_RESPONSE_FIRST):
 		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_READ_RESPONSE_MIDDLE):
 		ps->s_txreq->ss = &qp->s_ack_rdma_sge;
 		ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
@@ -881,8 +881,7 @@
 				goto bail;
 			}
 			qp->s_num_rd_atomic++;
-
-			/* FALLTHROUGH */
+			fallthrough;
 		case IB_WR_OPFN:
 			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
 				qp->s_lsn++;
@@ -946,10 +945,10 @@
 		 * See restart_rc().
 		 */
 		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_FIRST):
 		qp->s_state = OP(SEND_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_MIDDLE):
 		bth2 = mask_psn(qp->s_psn++);
 		ss = &qp->s_sge;
@@ -991,10 +990,10 @@
 		 * See restart_rc().
 		 */
 		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_WRITE_FIRST):
 		qp->s_state = OP(RDMA_WRITE_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_WRITE_MIDDLE):
 		bth2 = mask_psn(qp->s_psn++);
 		ss = &qp->s_sge;
@@ -2599,7 +2598,7 @@
 	 * to be sent before sending this one.
 	 */
 	e = NULL;
-	old_req = 1;
+	old_req = true;
 	ibp->rvp.n_rc_dupreq++;
 
 	spin_lock_irqsave(&qp->s_lock, flags);
@@ -2901,7 +2900,7 @@
 		if (!ret)
 			goto rnr_nak;
 		qp->r_rcv_len = 0;
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_MIDDLE):
 	case OP(RDMA_WRITE_MIDDLE):
 send_middle:
@@ -2941,7 +2940,7 @@
 			goto no_immediate_data;
 		if (opcode == OP(SEND_ONLY_WITH_INVALIDATE))
 			goto send_last_inv;
-		/* FALLTHROUGH -- for SEND_ONLY_WITH_IMMEDIATE */
+		fallthrough;	/* for SEND_ONLY_WITH_IMMEDIATE */
 	case OP(SEND_LAST_WITH_IMMEDIATE):
 send_last_imm:
 		wc.ex.imm_data = ohdr->u.imm_data;
@@ -2957,7 +2956,7 @@
 		goto send_last;
 	case OP(RDMA_WRITE_LAST):
 		copy_last = rvt_is_user_qp(qp);
-		/* fall through */
+		fallthrough;
 	case OP(SEND_LAST):
 no_immediate_data:
 		wc.wc_flags = 0;
@@ -3010,7 +3009,7 @@
 
 	case OP(RDMA_WRITE_ONLY):
 		copy_last = rvt_is_user_qp(qp);
-		/* fall through */
+		fallthrough;
 	case OP(RDMA_WRITE_FIRST):
 	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index 248be21..0b73dc7 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -232,11 +232,11 @@
 static void sdma_complete(struct kref *);
 static void sdma_finalput(struct sdma_state *);
 static void sdma_get(struct sdma_state *);
-static void sdma_hw_clean_up_task(unsigned long);
+static void sdma_hw_clean_up_task(struct tasklet_struct *);
 static void sdma_put(struct sdma_state *);
 static void sdma_set_state(struct sdma_engine *, enum sdma_states);
 static void sdma_start_hw_clean_up(struct sdma_engine *);
-static void sdma_sw_clean_up_task(unsigned long);
+static void sdma_sw_clean_up_task(struct tasklet_struct *);
 static void sdma_sendctrl(struct sdma_engine *, unsigned);
 static void init_sdma_regs(struct sdma_engine *, u32, uint);
 static void sdma_process_event(
@@ -545,9 +545,10 @@
 	schedule_work(&sde->err_halt_worker);
 }
 
-static void sdma_hw_clean_up_task(unsigned long opaque)
+static void sdma_hw_clean_up_task(struct tasklet_struct *t)
 {
-	struct sdma_engine *sde = (struct sdma_engine *)opaque;
+	struct sdma_engine *sde = from_tasklet(sde, t,
+					       sdma_hw_clean_up_task);
 	u64 statuscsr;
 
 	while (1) {
@@ -604,9 +605,9 @@
 		sdma_desc_avail(sde, sdma_descq_freecnt(sde));
 }
 
-static void sdma_sw_clean_up_task(unsigned long opaque)
+static void sdma_sw_clean_up_task(struct tasklet_struct *t)
 {
-	struct sdma_engine *sde = (struct sdma_engine *)opaque;
+	struct sdma_engine *sde = from_tasklet(sde, t, sdma_sw_clean_up_task);
 	unsigned long flags;
 
 	spin_lock_irqsave(&sde->tail_lock, flags);
@@ -833,7 +834,7 @@
 struct sdma_rht_map_elem {
 	u32 mask;
 	u8 ctr;
-	struct sdma_engine *sde[0];
+	struct sdma_engine *sde[];
 };
 
 struct sdma_rht_node {
@@ -848,7 +849,7 @@
 	.nelem_hint = NR_CPUS_HINT,
 	.head_offset = offsetof(struct sdma_rht_node, node),
 	.key_offset = offsetof(struct sdma_rht_node, cpu_id),
-	.key_len = FIELD_SIZEOF(struct sdma_rht_node, cpu_id),
+	.key_len = sizeof_field(struct sdma_rht_node, cpu_id),
 	.max_size = NR_CPUS,
 	.min_size = 8,
 	.automatic_shrinking = true,
@@ -879,10 +880,10 @@
 	if (current->nr_cpus_allowed != 1)
 		goto out;
 
-	cpu_id = smp_processor_id();
 	rcu_read_lock();
-	rht_node = rhashtable_lookup_fast(dd->sdma_rht, &cpu_id,
-					  sdma_rht_params);
+	cpu_id = smp_processor_id();
+	rht_node = rhashtable_lookup(dd->sdma_rht, &cpu_id,
+				     sdma_rht_params);
 
 	if (rht_node && rht_node->map[vl]) {
 		struct sdma_rht_map_elem *map = rht_node->map[vl];
@@ -1454,11 +1455,10 @@
 		sde->tail_csr =
 			get_kctxt_csr_addr(dd, this_idx, SD(TAIL));
 
-		tasklet_init(&sde->sdma_hw_clean_up_task, sdma_hw_clean_up_task,
-			     (unsigned long)sde);
-
-		tasklet_init(&sde->sdma_sw_clean_up_task, sdma_sw_clean_up_task,
-			     (unsigned long)sde);
+		tasklet_setup(&sde->sdma_hw_clean_up_task,
+			      sdma_hw_clean_up_task);
+		tasklet_setup(&sde->sdma_sw_clean_up_task,
+			      sdma_sw_clean_up_task);
 		INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait);
 		INIT_WORK(&sde->flush_worker, sdma_field_flush);
 
@@ -2584,7 +2584,7 @@
 			 * 7220, e.g.
 			 */
 			ss->go_s99_running = 1;
-			/* fall through -- and start dma engine */
+			fallthrough;	/* and start dma engine */
 		case sdma_event_e10_go_hw_start:
 			/* This reference means the state machine is started */
 			sdma_get(&sde->state);
@@ -2726,7 +2726,6 @@
 		case sdma_event_e70_go_idle:
 			break;
 		case sdma_event_e85_link_down:
-			/* fall through */
 		case sdma_event_e80_hw_freeze:
 			sdma_set_state(sde, sdma_state_s80_hw_freeze);
 			atomic_dec(&sde->dd->sdma_unfreeze_count);
@@ -3007,7 +3006,7 @@
 		case sdma_event_e60_hw_halted:
 			need_progress = 1;
 			sdma_err_progress_check_schedule(sde);
-			/* fall through */
+			fallthrough;
 		case sdma_event_e90_sw_halted:
 			/*
 			* SW initiated halt does not perform engines
@@ -3021,7 +3020,7 @@
 			break;
 		case sdma_event_e85_link_down:
 			ss->go_s99_running = 0;
-			/* fall through */
+			fallthrough;
 		case sdma_event_e80_hw_freeze:
 			sdma_set_state(sde, sdma_state_s80_hw_freeze);
 			atomic_dec(&sde->dd->sdma_unfreeze_count);
@@ -3251,7 +3250,7 @@
 		tx->num_desc++;
 		tx->descs[2].qw[0] = 0;
 		tx->descs[2].qw[1] = 0;
-		/* FALLTHROUGH */
+		fallthrough;
 	case SDMA_AHG_APPLY_UPDATE2:
 		tx->num_desc++;
 		tx->descs[1].qw[0] = 0;
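
The sdma.c changes above move to the tasklet_setup()/from_tasklet() API, where the callback receives the tasklet pointer and recovers its container instead of casting an opaque unsigned long. A hedged sketch of that pairing; example_engine and its functions are illustrative, not from the patch.

/* Sketch only: the tasklet_setup()/from_tasklet() pairing used above
 * (declarations come from <linux/interrupt.h>).
 */
struct example_engine {
	struct tasklet_struct clean_up_task;
	/* ... engine state ... */
};

static void example_clean_up(struct tasklet_struct *t)
{
	/* from_tasklet() is a container_of() wrapper keyed on the member name */
	struct example_engine *e = from_tasklet(e, t, clean_up_task);

	/* ... operate on e ... */
}

static void example_engine_init(struct example_engine *e)
{
	tasklet_setup(&e->clean_up_task, example_clean_up);
}
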
diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h
index 1e2e40f..7a85119 100644
--- a/drivers/infiniband/hw/hfi1/sdma.h
+++ b/drivers/infiniband/hw/hfi1/sdma.h
@@ -1002,7 +1002,7 @@
  */
 struct sdma_map_elem {
 	u32 mask;
-	struct sdma_engine *sde[0];
+	struct sdma_engine *sde[];
 };
 
 /**
@@ -1024,7 +1024,7 @@
 	u32 mask;
 	u8 actual_vls;
 	u8 vls;
-	struct sdma_map_elem *map[0];
+	struct sdma_map_elem *map[];
 };
 
 int sdma_map_init(
diff --git a/drivers/infiniband/hw/hfi1/tid_rdma.c b/drivers/infiniband/hw/hfi1/tid_rdma.c
index c018fc6..73d197e 100644
--- a/drivers/infiniband/hw/hfi1/tid_rdma.c
+++ b/drivers/infiniband/hw/hfi1/tid_rdma.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
 /*
- * Copyright(c) 2018 Intel Corporation.
+ * Copyright(c) 2018 - 2020 Intel Corporation.
  *
  */
 
@@ -194,7 +194,7 @@
 {
 	struct hfi1_qp_priv *priv = qp->priv;
 
-	p->qp = (kdeth_qp << 16) | priv->rcd->ctxt;
+	p->qp = (RVT_KDETH_QP_PREFIX << 16) | priv->rcd->ctxt;
 	p->max_len = TID_RDMA_MAX_SEGMENT_SIZE;
 	p->jkey = priv->rcd->jkey;
 	p->max_read = TID_RDMA_MAX_READ_SEGS_PER_REQ;
@@ -3228,7 +3228,7 @@
 	case IB_WR_RDMA_READ:
 		if (prev->wr.opcode != IB_WR_TID_RDMA_WRITE)
 			break;
-		/* fall through */
+		fallthrough;
 	case IB_WR_TID_RDMA_READ:
 		switch (prev->wr.opcode) {
 		case IB_WR_RDMA_READ:
@@ -5068,7 +5068,7 @@
 		if (priv->s_state == TID_OP(WRITE_REQ))
 			hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
 		priv->s_state = TID_OP(WRITE_DATA);
-		/* fall through */
+		fallthrough;
 
 	case TID_OP(WRITE_DATA):
 		/*
diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c
index 9a3d236..b219ea9 100644
--- a/drivers/infiniband/hw/hfi1/trace.c
+++ b/drivers/infiniband/hw/hfi1/trace.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -47,6 +47,7 @@
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 #include "exp_rcv.h"
+#include "ipoib.h"
 
 static u8 __get_ib_hdr_len(struct ib_header *hdr)
 {
@@ -126,6 +127,7 @@
 #define RETH_PRN "reth vaddr:0x%.16llx rkey:0x%.8x dlen:0x%.8x"
 #define AETH_PRN "aeth syn:0x%.2x %s msn:0x%.8x"
 #define DETH_PRN "deth qkey:0x%.8x sqpn:0x%.6x"
+#define DETH_ENTROPY_PRN "deth qkey:0x%.8x sqpn:0x%.6x entropy:0x%.2x"
 #define IETH_PRN "ieth rkey:0x%.8x"
 #define ATOMICACKETH_PRN "origdata:%llx"
 #define ATOMICETH_PRN "vaddr:0x%llx rkey:0x%.8x sdata:%llx cdata:%llx"
@@ -444,6 +446,12 @@
 		break;
 	/* deth */
 	case OP(UD, SEND_ONLY):
+		trace_seq_printf(p, DETH_ENTROPY_PRN,
+				 be32_to_cpu(eh->ud.deth[0]),
+				 be32_to_cpu(eh->ud.deth[1]) & RVT_QPN_MASK,
+				 be32_to_cpu(eh->ud.deth[1]) >>
+					     HFI1_IPOIB_ENTROPY_SHIFT);
+		break;
 	case OP(UD, SEND_ONLY_WITH_IMMEDIATE):
 		trace_seq_printf(p, DETH_PRN,
 				 be32_to_cpu(eh->ud.deth[0]),
@@ -512,6 +520,38 @@
 	return EXP_TID_GET(ent, IDX);
 }
 
+struct hfi1_ctxt_hist {
+	atomic_t count;
+	atomic_t data[255];
+};
+
+struct hfi1_ctxt_hist hist = {
+	.count = ATOMIC_INIT(0)
+};
+
+const char *hfi1_trace_print_rsm_hist(struct trace_seq *p, unsigned int ctxt)
+{
+	int i, len = ARRAY_SIZE(hist.data);
+	const char *ret = trace_seq_buffer_ptr(p);
+	unsigned long packet_count = atomic_fetch_inc(&hist.count);
+
+	trace_seq_printf(p, "packet[%lu]", packet_count);
+	for (i = 0; i < len; ++i) {
+		unsigned long val;
+		atomic_t *count = &hist.data[i];
+
+		if (ctxt == i)
+			val = atomic_fetch_inc(count);
+		else
+			val = atomic_read(count);
+
+		if (val)
+			trace_seq_printf(p, "(%d:%lu)", i, val);
+	}
+	trace_seq_putc(p, 0);
+	return ret;
+}
+
 __hfi1_trace_fn(AFFINITY);
 __hfi1_trace_fn(PKT);
 __hfi1_trace_fn(PROC);
diff --git a/drivers/infiniband/hw/hfi1/trace_ctxts.h b/drivers/infiniband/hw/hfi1/trace_ctxts.h
index e00c8a7..d8c168d 100644
--- a/drivers/infiniband/hw/hfi1/trace_ctxts.h
+++ b/drivers/infiniband/hw/hfi1/trace_ctxts.h
@@ -1,5 +1,5 @@
 /*
-* Copyright(c) 2015, 2016 Intel Corporation.
+* Copyright(c) 2015 - 2020 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
@@ -80,7 +80,7 @@
 			   __entry->credits = uctxt->sc->credits;
 			   __entry->hw_free = le64_to_cpu(*uctxt->sc->hw_free);
 			   __entry->piobase = uctxt->sc->base_addr;
-			   __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
+			   __entry->rcvhdrq_cnt = get_hdrq_cnt(uctxt);
 			   __entry->rcvhdrq_dma = uctxt->rcvhdrq_dma;
 			   __entry->eager_cnt = uctxt->egrbufs.alloced;
 			   __entry->rcvegr_dma = uctxt->egrbufs.rcvtids[0].dma;
@@ -138,6 +138,15 @@
 		      )
 );
 
+const char *hfi1_trace_print_rsm_hist(struct trace_seq *p, unsigned int ctxt);
+TRACE_EVENT(ctxt_rsm_hist,
+	    TP_PROTO(unsigned int ctxt),
+	    TP_ARGS(ctxt),
+	    TP_STRUCT__entry(__field(unsigned int, ctxt)),
+	    TP_fast_assign(__entry->ctxt = ctxt;),
+	    TP_printk("%s", hfi1_trace_print_rsm_hist(p, __entry->ctxt))
+);
+
 #endif /* __HFI1_TRACE_CTXTS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/drivers/infiniband/hw/hfi1/trace_rx.h b/drivers/infiniband/hw/hfi1/trace_rx.h
index 3cec960..168079e 100644
--- a/drivers/infiniband/hw/hfi1/trace_rx.h
+++ b/drivers/infiniband/hw/hfi1/trace_rx.h
@@ -106,19 +106,8 @@
 			     ),
 	    TP_fast_assign(DD_DEV_ASSIGN(dd);
 			__entry->ctxt = rcd->ctxt;
-			if (rcd->do_interrupt ==
-			    &handle_receive_interrupt) {
-				__entry->slow_path = 1;
-				__entry->dma_rtail = 0xFF;
-			} else if (rcd->do_interrupt ==
-					&handle_receive_interrupt_dma_rtail){
-				__entry->dma_rtail = 1;
-				__entry->slow_path = 0;
-			} else if (rcd->do_interrupt ==
-					&handle_receive_interrupt_nodma_rtail) {
-				__entry->dma_rtail = 0;
-				__entry->slow_path = 0;
-			}
+			__entry->slow_path = hfi1_is_slowpath(rcd);
+			__entry->dma_rtail = get_dma_rtail_setting(rcd);
 			),
 	    TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d",
 		      __get_str(dev),
diff --git a/drivers/infiniband/hw/hfi1/trace_tid.h b/drivers/infiniband/hw/hfi1/trace_tid.h
index 343fb98..985ffa9 100644
--- a/drivers/infiniband/hw/hfi1/trace_tid.h
+++ b/drivers/infiniband/hw/hfi1/trace_tid.h
@@ -138,10 +138,10 @@
 	TP_ARGS(dd, index, type, pa, order),
 	TP_STRUCT__entry(/* entry */
 		DD_DEV_ENTRY(dd)
-		__field(unsigned long, pa);
-		__field(u32, index);
-		__field(u32, type);
-		__field(u16, order);
+		__field(unsigned long, pa)
+		__field(u32, index)
+		__field(u32, type)
+		__field(u16, order)
 	),
 	TP_fast_assign(/* assign */
 		DD_DEV_ASSIGN(dd);
diff --git a/drivers/infiniband/hw/hfi1/trace_tx.h b/drivers/infiniband/hw/hfi1/trace_tx.h
index 09eb0c9..769e5e4 100644
--- a/drivers/infiniband/hw/hfi1/trace_tx.h
+++ b/drivers/infiniband/hw/hfi1/trace_tx.h
@@ -588,7 +588,7 @@
 	    TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i),
 	    TP_ARGS(dd, ctxt, subctxt, i),
 	    TP_STRUCT__entry(
-		    DD_DEV_ENTRY(dd);
+		    DD_DEV_ENTRY(dd)
 		    __field(u16, ctxt)
 		    __field(u8, subctxt)
 		    __field(u8, ver_opcode)
diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c
index 0c77f18..1fb9183 100644
--- a/drivers/infiniband/hw/hfi1/uc.c
+++ b/drivers/infiniband/hw/hfi1/uc.c
@@ -216,7 +216,7 @@
 
 	case OP(SEND_FIRST):
 		qp->s_state = OP(SEND_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_MIDDLE):
 		len = qp->s_len;
 		if (len > pmtu) {
@@ -241,7 +241,7 @@
 
 	case OP(RDMA_WRITE_FIRST):
 		qp->s_state = OP(RDMA_WRITE_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_WRITE_MIDDLE):
 		len = qp->s_len;
 		if (len > pmtu) {
@@ -414,7 +414,7 @@
 			goto no_immediate_data;
 		else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
 			goto send_last_imm;
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_MIDDLE):
 		/* Check for invalid length PMTU or posted rwqe len. */
 		/*
@@ -515,7 +515,7 @@
 			wc.ex.imm_data = ohdr->u.rc.imm_data;
 			goto rdma_last_imm;
 		}
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_WRITE_MIDDLE):
 		/* Check for invalid length PMTU or posted rwqe len. */
 		if (unlikely(tlen != (hdrsize + pmtu + 4)))
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index 4d73235..b94fc7f 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright(c) 2020 Cornelis Networks, Inc.
  * Copyright(c) 2015-2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
@@ -59,11 +60,11 @@
 			      struct tid_user_buf *tbuf,
 			      u32 rcventry, struct tid_group *grp,
 			      u16 pageidx, unsigned int npages);
-static int tid_rb_insert(void *arg, struct mmu_rb_node *node);
 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
 				    struct tid_rb_node *tnode);
-static void tid_rb_remove(void *arg, struct mmu_rb_node *node);
-static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
+static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
+			      const struct mmu_notifier_range *range,
+			      unsigned long cur_seq);
 static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
 			    struct tid_group *grp,
 			    unsigned int start, u16 count,
@@ -73,10 +74,8 @@
 			      struct tid_group **grp);
 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
 
-static struct mmu_rb_ops tid_rb_ops = {
-	.insert = tid_rb_insert,
-	.remove = tid_rb_remove,
-	.invalidate = tid_rb_invalidate
+static const struct mmu_interval_notifier_ops tid_mn_ops = {
+	.invalidate = tid_rb_invalidate,
 };
 
 /*
@@ -87,7 +86,6 @@
 int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
 			   struct hfi1_ctxtdata *uctxt)
 {
-	struct hfi1_devdata *dd = uctxt->dd;
 	int ret = 0;
 
 	fd->entry_to_rb = kcalloc(uctxt->expected_count,
@@ -106,20 +104,7 @@
 			fd->entry_to_rb = NULL;
 			return -ENOMEM;
 		}
-
-		/*
-		 * Register MMU notifier callbacks. If the registration
-		 * fails, continue without TID caching for this context.
-		 */
-		ret = hfi1_mmu_rb_register(fd, fd->mm, &tid_rb_ops,
-					   dd->pport->hfi1_wq,
-					   &fd->handler);
-		if (ret) {
-			dd_dev_info(dd,
-				    "Failed MMU notifier registration %d\n",
-				    ret);
-			ret = 0;
-		}
+		fd->use_mn = true;
 	}
 
 	/*
@@ -136,7 +121,7 @@
 	 * init.
 	 */
 	spin_lock(&fd->tid_lock);
-	if (uctxt->subctxt_cnt && fd->handler) {
+	if (uctxt->subctxt_cnt && fd->use_mn) {
 		u16 remainder;
 
 		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
@@ -155,20 +140,12 @@
 {
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 
-	/*
-	 * The notifier would have been removed when the process'es mm
-	 * was freed.
-	 */
-	if (fd->handler) {
-		hfi1_mmu_rb_unregister(fd->handler);
-	} else {
-		mutex_lock(&uctxt->exp_mutex);
-		if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
-			unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
-		if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
-			unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
-		mutex_unlock(&uctxt->exp_mutex);
-	}
+	mutex_lock(&uctxt->exp_mutex);
+	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
+		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
+	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
+		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
+	mutex_unlock(&uctxt->exp_mutex);
 
 	kfree(fd->invalid_tids);
 	fd->invalid_tids = NULL;
@@ -197,15 +174,18 @@
 {
 	struct page **pages;
 	struct hfi1_devdata *dd = fd->uctxt->dd;
+	struct mm_struct *mm;
 
 	if (mapped) {
 		pci_unmap_single(dd->pcidev, node->dma_addr,
-				 node->mmu.len, PCI_DMA_FROMDEVICE);
+				 node->npages * PAGE_SIZE, PCI_DMA_FROMDEVICE);
 		pages = &node->pages[idx];
+		mm = mm_from_tid_node(node);
 	} else {
 		pages = &tidbuf->pages[idx];
+		mm = current->mm;
 	}
-	hfi1_release_user_pages(fd->mm, pages, npages, mapped);
+	hfi1_release_user_pages(mm, pages, npages, mapped);
 	fd->tid_n_pinned -= npages;
 }
 
@@ -230,13 +210,6 @@
 		return -EINVAL;
 	}
 
-	/* Verify that access is OK for the user buffer */
-	if (!access_ok((void __user *)vaddr,
-		       npages * PAGE_SIZE)) {
-		dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
-			   (void *)vaddr, npages);
-		return -EFAULT;
-	}
 	/* Allocate the array of struct page pointers needed for pinning */
 	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
 	if (!pages)
@@ -247,12 +220,12 @@
 	 * pages, accept the amount pinned so far and program only that.
 	 * User space knows how to deal with partially programmed buffers.
 	 */
-	if (!hfi1_can_pin_pages(dd, fd->mm, fd->tid_n_pinned, npages)) {
+	if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
 		kfree(pages);
 		return -ENOMEM;
 	}
 
-	pinned = hfi1_acquire_user_pages(fd->mm, vaddr, npages, true, pages);
+	pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
 	if (pinned <= 0) {
 		kfree(pages);
 		return pinned;
@@ -776,8 +749,7 @@
 		return -EFAULT;
 	}
 
-	node->mmu.addr = tbuf->vaddr + (pageidx * PAGE_SIZE);
-	node->mmu.len = npages * PAGE_SIZE;
+	node->fdata = fd;
 	node->phys = page_to_phys(pages[0]);
 	node->npages = npages;
 	node->rcventry = rcventry;
@@ -786,23 +758,35 @@
 	node->freed = false;
 	memcpy(node->pages, pages, sizeof(struct page *) * npages);
 
-	if (!fd->handler)
-		ret = tid_rb_insert(fd, &node->mmu);
-	else
-		ret = hfi1_mmu_rb_insert(fd->handler, &node->mmu);
-
-	if (ret) {
-		hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
-			  node->rcventry, node->mmu.addr, node->phys, ret);
-		pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
-				 PCI_DMA_FROMDEVICE);
-		kfree(node);
-		return -EFAULT;
+	if (fd->use_mn) {
+		ret = mmu_interval_notifier_insert(
+			&node->notifier, current->mm,
+			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
+			&tid_mn_ops);
+		if (ret)
+			goto out_unmap;
+		/*
+		 * FIXME: This is in the wrong order, the notifier should be
+		 * established before the pages are pinned by pin_rcv_pages.
+		 */
+		mmu_interval_read_begin(&node->notifier);
 	}
+	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;
+
 	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
 	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
-			       node->mmu.addr, node->phys, phys);
+			       node->notifier.interval_tree.start, node->phys,
+			       phys);
 	return 0;
+
+out_unmap:
+	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
+		  node->rcventry, node->notifier.interval_tree.start,
+		  node->phys, ret);
+	pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE,
+			 PCI_DMA_FROMDEVICE);
+	kfree(node);
+	return -EFAULT;
 }
 
 static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo,
@@ -832,10 +816,9 @@
 	if (grp)
 		*grp = node->grp;
 
-	if (!fd->handler)
-		cacheless_tid_rb_remove(fd, node);
-	else
-		hfi1_mmu_rb_remove(fd->handler, &node->mmu);
+	if (fd->use_mn)
+		mmu_interval_notifier_remove(&node->notifier);
+	cacheless_tid_rb_remove(fd, node);
 
 	return 0;
 }
@@ -846,7 +829,8 @@
 	struct hfi1_devdata *dd = uctxt->dd;
 
 	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
-				 node->npages, node->mmu.addr, node->phys,
+				 node->npages,
+				 node->notifier.interval_tree.start, node->phys,
 				 node->dma_addr);
 
 	/*
@@ -893,30 +877,29 @@
 				if (!node || node->rcventry != rcventry)
 					continue;
 
+				if (fd->use_mn)
+					mmu_interval_notifier_remove(
+						&node->notifier);
 				cacheless_tid_rb_remove(fd, node);
 			}
 		}
 	}
 }
 
-/*
- * Always return 0 from this function.  A non-zero return indicates that the
- * remove operation will be called and that memory should be unpinned.
- * However, the driver cannot unpin out from under PSM.  Instead, retain the
- * memory (by returning 0) and inform PSM that the memory is going away.  PSM
- * will call back later when it has removed the memory from its list.
- */
-static int tid_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
+static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
+			      const struct mmu_notifier_range *range,
+			      unsigned long cur_seq)
 {
-	struct hfi1_filedata *fdata = arg;
-	struct hfi1_ctxtdata *uctxt = fdata->uctxt;
 	struct tid_rb_node *node =
-		container_of(mnode, struct tid_rb_node, mmu);
+		container_of(mni, struct tid_rb_node, notifier);
+	struct hfi1_filedata *fdata = node->fdata;
+	struct hfi1_ctxtdata *uctxt = fdata->uctxt;
 
 	if (node->freed)
-		return 0;
+		return true;
 
-	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr,
+	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
+				 node->notifier.interval_tree.start,
 				 node->rcventry, node->npages, node->dma_addr);
 	node->freed = true;
 
@@ -945,18 +928,7 @@
 		fdata->invalid_tid_idx++;
 	}
 	spin_unlock(&fdata->invalid_lock);
-	return 0;
-}
-
-static int tid_rb_insert(void *arg, struct mmu_rb_node *node)
-{
-	struct hfi1_filedata *fdata = arg;
-	struct tid_rb_node *tnode =
-		container_of(node, struct tid_rb_node, mmu);
-	u32 base = fdata->uctxt->expected_base;
-
-	fdata->entry_to_rb[tnode->rcventry - base] = tnode;
-	return 0;
+	return true;
 }
 
 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
@@ -967,12 +939,3 @@
 	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
 	clear_tid_node(fdata, tnode);
 }
-
-static void tid_rb_remove(void *arg, struct mmu_rb_node *node)
-{
-	struct hfi1_filedata *fdata = arg;
-	struct tid_rb_node *tnode =
-		container_of(node, struct tid_rb_node, mmu);
-
-	cacheless_tid_rb_remove(fdata, tnode);
-}
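
The user_exp_rcv.c rework above replaces the driver's private MMU rb-tree with the core mmu_interval_notifier API: the notifier is embedded in the tracked node and the invalidate callback only marks it stale. A hedged sketch of that registration pattern; example_node and its ops are illustrative names.

/* Sketch only: embed an interval notifier and flag invalidations. */
struct example_node {
	struct mmu_interval_notifier notifier;
	bool freed;
};

static bool example_invalidate(struct mmu_interval_notifier *mni,
			       const struct mmu_notifier_range *range,
			       unsigned long cur_seq)
{
	struct example_node *node =
		container_of(mni, struct example_node, notifier);

	node->freed = true;	/* owner unpins later, never from here */
	return true;
}

static const struct mmu_interval_notifier_ops example_mn_ops = {
	.invalidate = example_invalidate,
};

static int example_track(struct example_node *node, unsigned long start,
			 unsigned long len)
{
	return mmu_interval_notifier_insert(&node->notifier, current->mm,
					    start, len, &example_mn_ops);
}
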
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
index 43b105d..d45c7b6 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h
@@ -1,6 +1,7 @@
 #ifndef _HFI1_USER_EXP_RCV_H
 #define _HFI1_USER_EXP_RCV_H
 /*
+ * Copyright(c) 2020 - Cornelis Networks, Inc.
  * Copyright(c) 2015 - 2017 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
@@ -65,14 +66,15 @@
 };
 
 struct tid_rb_node {
-	struct mmu_rb_node mmu;
+	struct mmu_interval_notifier notifier;
+	struct hfi1_filedata *fdata;
 	unsigned long phys;
 	struct tid_group *grp;
 	u32 rcventry;
 	dma_addr_t dma_addr;
 	bool freed;
 	unsigned int npages;
-	struct page *pages[0];
+	struct page *pages[];
 };
 
 static inline int num_user_pages(unsigned long addr,
@@ -94,4 +96,9 @@
 int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
 			      struct hfi1_tid_info *tinfo);
 
+static inline struct mm_struct *mm_from_tid_node(struct tid_rb_node *node)
+{
+	return node->notifier.mm;
+}
+
 #endif /* _HFI1_USER_EXP_RCV_H */
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index 469acb9..3b50500 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -106,7 +106,7 @@
 	int ret;
 	unsigned int gup_flags = FOLL_LONGTERM | (writable ? FOLL_WRITE : 0);
 
-	ret = get_user_pages_fast(vaddr, npages, gup_flags, pages);
+	ret = pin_user_pages_fast(vaddr, npages, gup_flags, pages);
 	if (ret < 0)
 		return ret;
 
@@ -118,7 +118,7 @@
 void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
 			     size_t npages, bool dirty)
 {
-	put_user_pages_dirty_lock(p, npages, dirty);
+	unpin_user_pages_dirty_lock(p, npages, dirty);
 
 	if (mm) { /* during close after signal, mm can be NULL */
 		atomic64_sub(npages, &mm->pinned_vm);
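
The user_pages.c change above switches to the pin_user_pages*() API, so releases must go through unpin_user_pages_dirty_lock() rather than put_page(). A hedged sketch of the pairing; example_pin() is an illustrative helper, not part of the patch.

/* Sketch only: pin pages for long-term DMA and release them symmetrically. */
static int example_pin(unsigned long vaddr, int npages, struct page **pages)
{
	int pinned = pin_user_pages_fast(vaddr, npages,
					 FOLL_LONGTERM | FOLL_WRITE, pages);

	if (pinned <= 0)
		return pinned;

	/* ... DMA-map and use the first 'pinned' pages ... */

	unpin_user_pages_dirty_lock(pages, pinned, true);
	return 0;
}
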
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c
index a92346e..4a4956f 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.c
+++ b/drivers/infiniband/hw/hfi1/user_sdma.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright(c) 2020 - Cornelis Networks, Inc.
  * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
@@ -188,7 +189,6 @@
 	atomic_set(&pq->n_reqs, 0);
 	init_waitqueue_head(&pq->wait);
 	atomic_set(&pq->n_locked, 0);
-	pq->mm = fd->mm;
 
 	iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue,
 		    activate_packet_queue, NULL, NULL);
@@ -230,7 +230,7 @@
 
 	cq->nentries = hfi1_sdma_comp_ring_size;
 
-	ret = hfi1_mmu_rb_register(pq, pq->mm, &sdma_rb_ops, dd->pport->hfi1_wq,
+	ret = hfi1_mmu_rb_register(pq, &sdma_rb_ops, dd->pport->hfi1_wq,
 				   &pq->handler);
 	if (ret) {
 		dd_dev_err(dd, "Failed to register with MMU %d", ret);
@@ -980,13 +980,13 @@
 
 	npages -= node->npages;
 retry:
-	if (!hfi1_can_pin_pages(pq->dd, pq->mm,
+	if (!hfi1_can_pin_pages(pq->dd, current->mm,
 				atomic_read(&pq->n_locked), npages)) {
 		cleared = sdma_cache_evict(pq, npages);
 		if (cleared >= npages)
 			goto retry;
 	}
-	pinned = hfi1_acquire_user_pages(pq->mm,
+	pinned = hfi1_acquire_user_pages(current->mm,
 					 ((unsigned long)iovec->iov.iov_base +
 					 (node->npages * PAGE_SIZE)), npages, 0,
 					 pages + node->npages);
@@ -995,7 +995,7 @@
 		return pinned;
 	}
 	if (pinned != npages) {
-		unpin_vector_pages(pq->mm, pages, node->npages, pinned);
+		unpin_vector_pages(current->mm, pages, node->npages, pinned);
 		return -EFAULT;
 	}
 	kfree(node->pages);
@@ -1008,7 +1008,8 @@
 static void unpin_sdma_pages(struct sdma_mmu_node *node)
 {
 	if (node->npages) {
-		unpin_vector_pages(node->pq->mm, node->pages, 0, node->npages);
+		unpin_vector_pages(mm_from_sdma_node(node), node->pages, 0,
+				   node->npages);
 		atomic_sub(node->npages, &node->pq->n_locked);
 	}
 }
diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h
index 9972e0e..1e8c02f 100644
--- a/drivers/infiniband/hw/hfi1/user_sdma.h
+++ b/drivers/infiniband/hw/hfi1/user_sdma.h
@@ -1,6 +1,7 @@
 #ifndef _HFI1_USER_SDMA_H
 #define _HFI1_USER_SDMA_H
 /*
+ * Copyright(c) 2020 - Cornelis Networks, Inc.
  * Copyright(c) 2015 - 2018 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
@@ -133,7 +134,6 @@
 	unsigned long unpinned;
 	struct mmu_rb_handler *handler;
 	atomic_t n_locked;
-	struct mm_struct *mm;
 };
 
 struct hfi1_user_sdma_comp_q {
@@ -250,4 +250,9 @@
 				   struct iovec *iovec, unsigned long dim,
 				   unsigned long *count);
 
+static inline struct mm_struct *mm_from_sdma_node(struct sdma_mmu_node *node)
+{
+	return node->rb.handler->mn.mm;
+}
+
 #endif /* _HFI1_USER_SDMA_H */
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 2f6323a..3591923 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2015 - 2018 Intel Corporation.
+ * Copyright(c) 2015 - 2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -66,6 +66,7 @@
 #include "vnic.h"
 #include "fault.h"
 #include "affinity.h"
+#include "ipoib.h"
 
 static unsigned int hfi1_lkey_table_size = 16;
 module_param_named(lkey_table_size, hfi1_lkey_table_size, uint,
@@ -1342,7 +1343,7 @@
 			IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
 			IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE |
 			IB_DEVICE_MEM_MGT_EXTENSIONS |
-			IB_DEVICE_RDMA_NETDEV_OPA_VNIC;
+			IB_DEVICE_RDMA_NETDEV_OPA;
 	rdi->dparms.props.page_size_cap = PAGE_SIZE;
 	rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3;
 	rdi->dparms.props.vendor_part_id = dd->pcidev->device;
@@ -1360,7 +1361,6 @@
 	rdi->dparms.props.max_cq = hfi1_max_cqs;
 	rdi->dparms.props.max_ah = hfi1_max_ahs;
 	rdi->dparms.props.max_cqe = hfi1_max_cqes;
-	rdi->dparms.props.max_map_per_fmr = 32767;
 	rdi->dparms.props.max_pd = hfi1_max_pds;
 	rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC;
 	rdi->dparms.props.max_qp_init_rd_atom = 255;
@@ -1424,7 +1424,7 @@
 	props->gid_tbl_len = HFI1_GUIDS_PER_PORT;
 	props->active_width = (u8)opa_width_to_ib(ppd->link_width_active);
 	/* see rate_show() in ib core/sysfs.c */
-	props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active);
+	props->active_speed = opa_speed_to_ib(ppd->link_speed_active);
 	props->max_vl_num = ppd->vls_supported;
 
 	/* Once we are a "first class" citizen and have added the OPA MTUs to
@@ -1439,6 +1439,8 @@
 				      4096 : hfi1_max_mtu), IB_MTU_4096);
 	props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
 		mtu_to_enum(ppd->ibmtu, IB_MTU_4096);
+	props->phys_mtu = HFI1_CAP_IS_KSET(AIP) ? hfi1_max_mtu :
+				ib_mtu_enum_to_int(props->max_mtu);
 
 	return 0;
 }
@@ -1793,6 +1795,7 @@
 	.modify_device = modify_device,
 	/* keep process mad in the driver */
 	.process_mad = hfi1_process_mad,
+	.rdma_netdev_get_params = hfi1_ipoib_rn_get_params,
 };
 
 /**
@@ -1863,9 +1866,8 @@
 	dd->verbs_dev.rdi.dparms.qpn_start = 0;
 	dd->verbs_dev.rdi.dparms.qpn_inc = 1;
 	dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift;
-	dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16;
-	dd->verbs_dev.rdi.dparms.qpn_res_end =
-	dd->verbs_dev.rdi.dparms.qpn_res_start + 65535;
+	dd->verbs_dev.rdi.dparms.qpn_res_start = RVT_KDETH_QP_BASE;
+	dd->verbs_dev.rdi.dparms.qpn_res_end = RVT_AIP_QP_MAX;
 	dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC;
 	dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK;
 	dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT;
diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h
index ae9582d..d36e3e1 100644
--- a/drivers/infiniband/hw/hfi1/verbs.h
+++ b/drivers/infiniband/hw/hfi1/verbs.h
@@ -107,9 +107,9 @@
 	HFI1_HAS_GRH = (1 << 0),
 };
 
-#define LRH_16B_BYTES (FIELD_SIZEOF(struct hfi1_16b_header, lrh))
+#define LRH_16B_BYTES (sizeof_field(struct hfi1_16b_header, lrh))
 #define LRH_16B_DWORDS (LRH_16B_BYTES / sizeof(u32))
-#define LRH_9B_BYTES (FIELD_SIZEOF(struct ib_header, lrh))
+#define LRH_9B_BYTES (sizeof_field(struct ib_header, lrh))
 #define LRH_9B_DWORDS (LRH_9B_BYTES / sizeof(u32))
 
 /* 24Bits for qpn, upper 8Bits reserved */
@@ -330,9 +330,8 @@
 void hfi1_node_desc_chg(struct hfi1_ibport *ibp);
 int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
 		     const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-		     const struct ib_mad_hdr *in_mad, size_t in_mad_size,
-		     struct ib_mad_hdr *out_mad, size_t *out_mad_size,
-		     u16 *out_mad_pkey_index);
+		     const struct ib_mad *in_mad, struct ib_mad *out_mad,
+		     size_t *out_mad_size, u16 *out_mad_pkey_index);
 
 /*
  * The PSN_MASK and PSN_SHIFT allow for
diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h
index bfa6e08..d2d526c 100644
--- a/drivers/infiniband/hw/hfi1/verbs_txreq.h
+++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h
@@ -91,7 +91,7 @@
 	tx->mr = NULL;
 	tx->sde = priv->s_sde;
 	tx->psc = priv->s_sendcontext;
-	/* so that we can test if the sdma decriptors are there */
+	/* so that we can test if the sdma descriptors are there */
 	tx->txreq.num_desc = 0;
 	/* Set the header type */
 	tx->phdr.hdr.hdr_type = priv->hdr_type;
diff --git a/drivers/infiniband/hw/hfi1/vnic.h b/drivers/infiniband/hw/hfi1/vnic.h
index 5ae7815..66150a1 100644
--- a/drivers/infiniband/hw/hfi1/vnic.h
+++ b/drivers/infiniband/hw/hfi1/vnic.h
@@ -1,7 +1,7 @@
 #ifndef _HFI1_VNIC_H
 #define _HFI1_VNIC_H
 /*
- * Copyright(c) 2017 Intel Corporation.
+ * Copyright(c) 2017 - 2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -69,6 +69,7 @@
 #define HFI1_VNIC_SC_SHIFT      4
 
 #define HFI1_VNIC_MAX_QUEUE 16
+#define HFI1_NUM_VNIC_CTXT 8
 
 /**
  * struct hfi1_vnic_sdma - VNIC per Tx ring SDMA information
@@ -104,7 +105,6 @@
 	struct hfi1_vnic_vport_info *vinfo;
 	struct net_device           *netdev;
 	struct napi_struct           napi;
-	struct sk_buff_head          skbq;
 };
 
 /**
@@ -146,7 +146,6 @@
 
 /* vnic hfi1 internal functions */
 void hfi1_vnic_setup(struct hfi1_devdata *dd);
-void hfi1_vnic_cleanup(struct hfi1_devdata *dd);
 int hfi1_vnic_txreq_init(struct hfi1_devdata *dd);
 void hfi1_vnic_txreq_deinit(struct hfi1_devdata *dd);
 
diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c
index b49e60e..a90824d 100644
--- a/drivers/infiniband/hw/hfi1/vnic_main.c
+++ b/drivers/infiniband/hw/hfi1/vnic_main.c
@@ -1,5 +1,5 @@
 /*
- * Copyright(c) 2017 - 2018 Intel Corporation.
+ * Copyright(c) 2017 - 2020 Intel Corporation.
  *
  * This file is provided under a dual BSD/GPLv2 license.  When using or
  * redistributing this file, you may do so under either license.
@@ -53,6 +53,7 @@
 #include <linux/if_vlan.h>
 
 #include "vnic.h"
+#include "netdev.h"
 
 #define HFI_TX_TIMEOUT_MS 1000
 
@@ -62,114 +63,6 @@
 
 static DEFINE_SPINLOCK(vport_cntr_lock);
 
-static int setup_vnic_ctxt(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt)
-{
-	unsigned int rcvctrl_ops = 0;
-	int ret;
-
-	uctxt->do_interrupt = &handle_receive_interrupt;
-
-	/* Now allocate the RcvHdr queue and eager buffers. */
-	ret = hfi1_create_rcvhdrq(dd, uctxt);
-	if (ret)
-		goto done;
-
-	ret = hfi1_setup_eagerbufs(uctxt);
-	if (ret)
-		goto done;
-
-	if (uctxt->rcvhdrtail_kvaddr)
-		clear_rcvhdrtail(uctxt);
-
-	rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
-	rcvctrl_ops |= HFI1_RCVCTRL_INTRAVAIL_ENB;
-
-	if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR))
-		rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
-	if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL))
-		rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
-	if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
-		rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
-	if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL))
-		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
-
-	hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt);
-done:
-	return ret;
-}
-
-static int allocate_vnic_ctxt(struct hfi1_devdata *dd,
-			      struct hfi1_ctxtdata **vnic_ctxt)
-{
-	struct hfi1_ctxtdata *uctxt;
-	int ret;
-
-	if (dd->flags & HFI1_FROZEN)
-		return -EIO;
-
-	ret = hfi1_create_ctxtdata(dd->pport, dd->node, &uctxt);
-	if (ret < 0) {
-		dd_dev_err(dd, "Unable to create ctxtdata, failing open\n");
-		return -ENOMEM;
-	}
-
-	uctxt->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) |
-			HFI1_CAP_KGET(NODROP_RHQ_FULL) |
-			HFI1_CAP_KGET(NODROP_EGR_FULL) |
-			HFI1_CAP_KGET(DMA_RTAIL);
-	uctxt->seq_cnt = 1;
-	uctxt->is_vnic = true;
-
-	msix_request_rcd_irq(uctxt);
-
-	hfi1_stats.sps_ctxts++;
-	dd_dev_dbg(dd, "created vnic context %d\n", uctxt->ctxt);
-	*vnic_ctxt = uctxt;
-
-	return 0;
-}
-
-static void deallocate_vnic_ctxt(struct hfi1_devdata *dd,
-				 struct hfi1_ctxtdata *uctxt)
-{
-	dd_dev_dbg(dd, "closing vnic context %d\n", uctxt->ctxt);
-	flush_wc();
-
-	/*
-	 * Disable receive context and interrupt available, reset all
-	 * RcvCtxtCtrl bits to default values.
-	 */
-	hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
-		     HFI1_RCVCTRL_TIDFLOW_DIS |
-		     HFI1_RCVCTRL_INTRAVAIL_DIS |
-		     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
-		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
-		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt);
-
-	/* msix_intr will always be > 0, only clean up if this is true */
-	if (uctxt->msix_intr)
-		msix_free_irq(dd, uctxt->msix_intr);
-
-	uctxt->event_flags = 0;
-
-	hfi1_clear_tids(uctxt);
-	hfi1_clear_ctxt_pkey(dd, uctxt);
-
-	hfi1_stats.sps_ctxts--;
-
-	hfi1_free_ctxt(uctxt);
-}
-
-void hfi1_vnic_setup(struct hfi1_devdata *dd)
-{
-	xa_init(&dd->vnic.vesws);
-}
-
-void hfi1_vnic_cleanup(struct hfi1_devdata *dd)
-{
-	WARN_ON(!xa_empty(&dd->vnic.vesws));
-}
-
 #define SUM_GRP_COUNTERS(stats, qstats, x_grp) do {            \
 		u64 *src64, *dst64;                            \
 		for (src64 = &qstats->x_grp.unicast,           \
@@ -179,6 +72,9 @@
 		}                                              \
 	} while (0)
 
+#define VNIC_MASK (0xFF)
+#define VNIC_ID(val) ((1ull << 24) | ((val) & VNIC_MASK))
+
 /* hfi1_vnic_update_stats - update statistics */
 static void hfi1_vnic_update_stats(struct hfi1_vnic_vport_info *vinfo,
 				   struct opa_vnic_stats *stats)
@@ -454,71 +350,25 @@
 	return rc;
 }
 
-static inline struct sk_buff *hfi1_vnic_get_skb(struct hfi1_vnic_rx_queue *rxq)
+static struct hfi1_vnic_vport_info *get_vnic_port(struct hfi1_devdata *dd,
+						  int vesw_id)
 {
-	unsigned char *pad_info;
-	struct sk_buff *skb;
+	int vnic_id = VNIC_ID(vesw_id);
 
-	skb = skb_dequeue(&rxq->skbq);
-	if (unlikely(!skb))
+	return hfi1_netdev_get_data(dd, vnic_id);
+}
+
+static struct hfi1_vnic_vport_info *get_first_vnic_port(struct hfi1_devdata *dd)
+{
+	struct hfi1_vnic_vport_info *vinfo;
+	int next_id = VNIC_ID(0);
+
+	vinfo = hfi1_netdev_get_first_data(dd, &next_id);
+
+	if (next_id > VNIC_ID(VNIC_MASK))
 		return NULL;
 
-	/* remove tail padding and icrc */
-	pad_info = skb->data + skb->len - 1;
-	skb_trim(skb, (skb->len - OPA_VNIC_ICRC_TAIL_LEN -
-		       ((*pad_info) & 0x7)));
-
-	return skb;
-}
-
-/* hfi1_vnic_handle_rx - handle skb receive */
-static void hfi1_vnic_handle_rx(struct hfi1_vnic_rx_queue *rxq,
-				int *work_done, int work_to_do)
-{
-	struct hfi1_vnic_vport_info *vinfo = rxq->vinfo;
-	struct sk_buff *skb;
-	int rc;
-
-	while (1) {
-		if (*work_done >= work_to_do)
-			break;
-
-		skb = hfi1_vnic_get_skb(rxq);
-		if (unlikely(!skb))
-			break;
-
-		rc = hfi1_vnic_decap_skb(rxq, skb);
-		/* update rx counters */
-		hfi1_vnic_update_rx_counters(vinfo, rxq->idx, skb, rc);
-		if (unlikely(rc)) {
-			dev_kfree_skb_any(skb);
-			continue;
-		}
-
-		skb_checksum_none_assert(skb);
-		skb->protocol = eth_type_trans(skb, rxq->netdev);
-
-		napi_gro_receive(&rxq->napi, skb);
-		(*work_done)++;
-	}
-}
-
-/* hfi1_vnic_napi - napi receive polling callback function */
-static int hfi1_vnic_napi(struct napi_struct *napi, int budget)
-{
-	struct hfi1_vnic_rx_queue *rxq = container_of(napi,
-					      struct hfi1_vnic_rx_queue, napi);
-	struct hfi1_vnic_vport_info *vinfo = rxq->vinfo;
-	int work_done = 0;
-
-	v_dbg("napi %d budget %d\n", rxq->idx, budget);
-	hfi1_vnic_handle_rx(rxq, &work_done, budget);
-
-	v_dbg("napi %d work_done %d\n", rxq->idx, work_done);
-	if (work_done < budget)
-		napi_complete(napi);
-
-	return work_done;
+	return vinfo;
 }
 
 void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet)
@@ -527,13 +377,14 @@
 	struct hfi1_vnic_vport_info *vinfo = NULL;
 	struct hfi1_vnic_rx_queue *rxq;
 	struct sk_buff *skb;
-	int l4_type, vesw_id = -1;
+	int l4_type, vesw_id = -1, rc;
 	u8 q_idx;
+	unsigned char *pad_info;
 
 	l4_type = hfi1_16B_get_l4(packet->ebuf);
 	if (likely(l4_type == OPA_16B_L4_ETHR)) {
 		vesw_id = HFI1_VNIC_GET_VESWID(packet->ebuf);
-		vinfo = xa_load(&dd->vnic.vesws, vesw_id);
+		vinfo = get_vnic_port(dd, vesw_id);
 
 		/*
 		 * In case of invalid vesw id, count the error on
@@ -541,10 +392,8 @@
 		 */
 		if (unlikely(!vinfo)) {
 			struct hfi1_vnic_vport_info *vinfo_tmp;
-			unsigned long index = 0;
 
-			vinfo_tmp = xa_find(&dd->vnic.vesws, &index, ULONG_MAX,
-					XA_PRESENT);
+			vinfo_tmp = get_first_vnic_port(dd);
 			if (vinfo_tmp) {
 				spin_lock(&vport_cntr_lock);
 				vinfo_tmp->stats[0].netstats.rx_nohandler++;
@@ -563,12 +412,6 @@
 	rxq = &vinfo->rxq[q_idx];
 	if (unlikely(!netif_oper_up(vinfo->netdev))) {
 		vinfo->stats[q_idx].rx_drop_state++;
-		skb_queue_purge(&rxq->skbq);
-		return;
-	}
-
-	if (unlikely(skb_queue_len(&rxq->skbq) > HFI1_VNIC_RCV_Q_SIZE)) {
-		vinfo->stats[q_idx].netstats.rx_fifo_errors++;
 		return;
 	}
 
@@ -580,62 +423,65 @@
 
 	memcpy(skb->data, packet->ebuf, packet->tlen);
 	skb_put(skb, packet->tlen);
-	skb_queue_tail(&rxq->skbq, skb);
 
-	if (napi_schedule_prep(&rxq->napi)) {
-		v_dbg("napi %d scheduling\n", q_idx);
-		__napi_schedule(&rxq->napi);
+	pad_info = skb->data + skb->len - 1;
+	skb_trim(skb, (skb->len - OPA_VNIC_ICRC_TAIL_LEN -
+		       ((*pad_info) & 0x7)));
+
+	rc = hfi1_vnic_decap_skb(rxq, skb);
+
+	/* update rx counters */
+	hfi1_vnic_update_rx_counters(vinfo, rxq->idx, skb, rc);
+	if (unlikely(rc)) {
+		dev_kfree_skb_any(skb);
+		return;
 	}
+
+	skb_checksum_none_assert(skb);
+	skb->protocol = eth_type_trans(skb, rxq->netdev);
+
+	napi_gro_receive(&rxq->napi, skb);
 }
 
 static int hfi1_vnic_up(struct hfi1_vnic_vport_info *vinfo)
 {
 	struct hfi1_devdata *dd = vinfo->dd;
 	struct net_device *netdev = vinfo->netdev;
-	int i, rc;
+	int rc;
 
 	/* ensure virtual eth switch id is valid */
 	if (!vinfo->vesw_id)
 		return -EINVAL;
 
-	rc = xa_insert(&dd->vnic.vesws, vinfo->vesw_id, vinfo, GFP_KERNEL);
+	rc = hfi1_netdev_add_data(dd, VNIC_ID(vinfo->vesw_id), vinfo);
 	if (rc < 0)
 		return rc;
 
-	for (i = 0; i < vinfo->num_rx_q; i++) {
-		struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i];
-
-		skb_queue_head_init(&rxq->skbq);
-		napi_enable(&rxq->napi);
-	}
+	rc = hfi1_netdev_rx_init(dd);
+	if (rc)
+		goto err_remove;
 
 	netif_carrier_on(netdev);
 	netif_tx_start_all_queues(netdev);
 	set_bit(HFI1_VNIC_UP, &vinfo->flags);
 
 	return 0;
+
+err_remove:
+	hfi1_netdev_remove_data(dd, VNIC_ID(vinfo->vesw_id));
+	return rc;
 }
 
 static void hfi1_vnic_down(struct hfi1_vnic_vport_info *vinfo)
 {
 	struct hfi1_devdata *dd = vinfo->dd;
-	u8 i;
 
 	clear_bit(HFI1_VNIC_UP, &vinfo->flags);
 	netif_carrier_off(vinfo->netdev);
 	netif_tx_disable(vinfo->netdev);
-	xa_erase(&dd->vnic.vesws, vinfo->vesw_id);
+	hfi1_netdev_remove_data(dd, VNIC_ID(vinfo->vesw_id));
 
-	/* ensure irqs see the change */
-	msix_vnic_synchronize_irq(dd);
-
-	/* remove unread skbs */
-	for (i = 0; i < vinfo->num_rx_q; i++) {
-		struct hfi1_vnic_rx_queue *rxq = &vinfo->rxq[i];
-
-		napi_disable(&rxq->napi);
-		skb_queue_purge(&rxq->skbq);
-	}
+	hfi1_netdev_rx_destroy(dd);
 }
 
 static int hfi1_netdev_open(struct net_device *netdev)
@@ -660,70 +506,31 @@
 	return 0;
 }
 
-static int hfi1_vnic_allot_ctxt(struct hfi1_devdata *dd,
-				struct hfi1_ctxtdata **vnic_ctxt)
-{
-	int rc;
-
-	rc = allocate_vnic_ctxt(dd, vnic_ctxt);
-	if (rc) {
-		dd_dev_err(dd, "vnic ctxt alloc failed %d\n", rc);
-		return rc;
-	}
-
-	rc = setup_vnic_ctxt(dd, *vnic_ctxt);
-	if (rc) {
-		dd_dev_err(dd, "vnic ctxt setup failed %d\n", rc);
-		deallocate_vnic_ctxt(dd, *vnic_ctxt);
-		*vnic_ctxt = NULL;
-	}
-
-	return rc;
-}
-
 static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo)
 {
 	struct hfi1_devdata *dd = vinfo->dd;
-	int i, rc = 0;
+	int rc = 0;
 
 	mutex_lock(&hfi1_mutex);
-	if (!dd->vnic.num_vports) {
+	if (!dd->vnic_num_vports) {
 		rc = hfi1_vnic_txreq_init(dd);
 		if (rc)
 			goto txreq_fail;
 	}
 
-	for (i = dd->vnic.num_ctxt; i < vinfo->num_rx_q; i++) {
-		rc = hfi1_vnic_allot_ctxt(dd, &dd->vnic.ctxt[i]);
-		if (rc)
-			break;
-		hfi1_rcd_get(dd->vnic.ctxt[i]);
-		dd->vnic.ctxt[i]->vnic_q_idx = i;
-	}
-
-	if (i < vinfo->num_rx_q) {
-		/*
-		 * If required amount of contexts is not
-		 * allocated successfully then remaining contexts
-		 * are released.
-		 */
-		while (i-- > dd->vnic.num_ctxt) {
-			deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]);
-			hfi1_rcd_put(dd->vnic.ctxt[i]);
-			dd->vnic.ctxt[i] = NULL;
-		}
+	rc = hfi1_netdev_rx_init(dd);
+	if (rc) {
+		dd_dev_err(dd, "Unable to initialize netdev contexts\n");
 		goto alloc_fail;
 	}
 
-	if (dd->vnic.num_ctxt != i) {
-		dd->vnic.num_ctxt = i;
-		hfi1_init_vnic_rsm(dd);
-	}
+	hfi1_init_vnic_rsm(dd);
 
-	dd->vnic.num_vports++;
+	dd->vnic_num_vports++;
 	hfi1_vnic_sdma_init(vinfo);
+
 alloc_fail:
-	if (!dd->vnic.num_vports)
+	if (!dd->vnic_num_vports)
 		hfi1_vnic_txreq_deinit(dd);
 txreq_fail:
 	mutex_unlock(&hfi1_mutex);
@@ -733,20 +540,14 @@
 static void hfi1_vnic_deinit(struct hfi1_vnic_vport_info *vinfo)
 {
 	struct hfi1_devdata *dd = vinfo->dd;
-	int i;
 
 	mutex_lock(&hfi1_mutex);
-	if (--dd->vnic.num_vports == 0) {
-		for (i = 0; i < dd->vnic.num_ctxt; i++) {
-			deallocate_vnic_ctxt(dd, dd->vnic.ctxt[i]);
-			hfi1_rcd_put(dd->vnic.ctxt[i]);
-			dd->vnic.ctxt[i] = NULL;
-		}
+	if (--dd->vnic_num_vports == 0) {
 		hfi1_deinit_vnic_rsm(dd);
-		dd->vnic.num_ctxt = 0;
 		hfi1_vnic_txreq_deinit(dd);
 	}
 	mutex_unlock(&hfi1_mutex);
+	hfi1_netdev_rx_destroy(dd);
 }
 
 static void hfi1_vnic_set_vesw_id(struct net_device *netdev, int id)
@@ -804,7 +605,7 @@
 	struct rdma_netdev *rn;
 	int i, size, rc;
 
-	if (!dd->num_vnic_contexts)
+	if (!dd->num_netdev_contexts)
 		return ERR_PTR(-ENOMEM);
 
 	if (!port_num || (port_num > dd->num_pports))
@@ -815,15 +616,16 @@
 
 	size = sizeof(struct opa_vnic_rdma_netdev) + sizeof(*vinfo);
 	netdev = alloc_netdev_mqs(size, name, name_assign_type, setup,
-				  dd->num_sdma, dd->num_vnic_contexts);
+				  chip_sdma_engines(dd),
+				  dd->num_netdev_contexts);
 	if (!netdev)
 		return ERR_PTR(-ENOMEM);
 
 	rn = netdev_priv(netdev);
 	vinfo = opa_vnic_dev_priv(netdev);
 	vinfo->dd = dd;
-	vinfo->num_tx_q = dd->num_sdma;
-	vinfo->num_rx_q = dd->num_vnic_contexts;
+	vinfo->num_tx_q = chip_sdma_engines(dd);
+	vinfo->num_rx_q = dd->num_netdev_contexts;
 	vinfo->netdev = netdev;
 	rn->free_rdma_netdev = hfi1_vnic_free_rn;
 	rn->set_id = hfi1_vnic_set_vesw_id;
@@ -841,7 +643,6 @@
 		rxq->idx = i;
 		rxq->vinfo = vinfo;
 		rxq->netdev = netdev;
-		netif_napi_add(netdev, &rxq->napi, hfi1_vnic_napi, 64);
 	}
 
 	rc = hfi1_vnic_init(vinfo);
diff --git a/drivers/infiniband/hw/hns/Kconfig b/drivers/infiniband/hw/hns/Kconfig
index 4921c1e..18d10eb 100644
--- a/drivers/infiniband/hw/hns/Kconfig
+++ b/drivers/infiniband/hw/hns/Kconfig
@@ -4,7 +4,7 @@
 	depends on NET_VENDOR_HISILICON
 	depends on ARM64 || (COMPILE_TEST && 64BIT)
 	depends on (HNS_DSAF && HNS_ENET) || HNS3
-	---help---
+	help
 	  This is a RoCE/RDMA driver for the Hisilicon RoCE engine. The engine
 	  is used in Hisilicon Hip06 and more further ICT SoC based on
 	  platform device.
@@ -15,7 +15,7 @@
 	bool "Hisilicon Hip06 Family RoCE support"
 	depends on INFINIBAND_HNS && HNS && HNS_DSAF && HNS_ENET
 	depends on INFINIBAND_HNS=m || (HNS_DSAF=y && HNS_ENET=y)
-	---help---
+	help
 	  RoCE driver support for Hisilicon RoCE engine in Hisilicon Hip06 and
 	  Hip07 SoC. These RoCE engines are platform devices.
 
@@ -26,7 +26,7 @@
 	bool "Hisilicon Hip08 Family RoCE support"
 	depends on INFINIBAND_HNS && PCI && HNS3
 	depends on INFINIBAND_HNS=m || HNS3=y
-	---help---
+	help
 	  RoCE driver support for Hisilicon RoCE engine in Hisilicon Hip08 SoC.
 	  The RoCE engine is a PCI device.
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c
index 90e08c0..174b19e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_ah.c
+++ b/drivers/infiniband/hw/hns/hns_roce_ah.c
@@ -31,55 +31,62 @@
  */
 
 #include <linux/platform_device.h>
+#include <linux/pci.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
 #include "hns_roce_device.h"
 
-#define HNS_ROCE_PORT_NUM_SHIFT		24
-#define HNS_ROCE_VLAN_SL_BIT_MASK	7
-#define HNS_ROCE_VLAN_SL_SHIFT		13
-
-int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
-		       u32 flags, struct ib_udata *udata)
+static inline u16 get_ah_udp_sport(const struct rdma_ah_attr *ah_attr)
 {
-	struct hns_roce_dev *hr_dev = to_hr_dev(ibah->device);
-	const struct ib_gid_attr *gid_attr;
-	struct device *dev = hr_dev->dev;
-	struct hns_roce_ah *ah = to_hr_ah(ibah);
-	u16 vlan_tag = 0xffff;
+	u32 fl = ah_attr->grh.flow_label;
+	u16 sport;
+
+	if (!fl)
+		sport = get_random_u32() %
+			(IB_ROCE_UDP_ENCAP_VALID_PORT_MAX + 1 -
+			 IB_ROCE_UDP_ENCAP_VALID_PORT_MIN) +
+			IB_ROCE_UDP_ENCAP_VALID_PORT_MIN;
+	else
+		sport = rdma_flow_label_to_udp_sport(fl);
+
+	return sport;
+}
+
+int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
+		       struct ib_udata *udata)
+{
+	struct rdma_ah_attr *ah_attr = init_attr->ah_attr;
 	const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
-	bool vlan_en = false;
-	int ret;
-
-	gid_attr = ah_attr->grh.sgid_attr;
-	ret = rdma_read_gid_l2_fields(gid_attr, &vlan_tag, NULL);
-	if (ret)
-		return ret;
-
-	/* Get mac address */
-	memcpy(ah->av.mac, ah_attr->roce.dmac, ETH_ALEN);
-
-	if (vlan_tag < VLAN_CFI_MASK) {
-		vlan_en = true;
-		vlan_tag |= (rdma_ah_get_sl(ah_attr) &
-			     HNS_ROCE_VLAN_SL_BIT_MASK) <<
-			     HNS_ROCE_VLAN_SL_SHIFT;
-	}
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibah->device);
+	struct hns_roce_ah *ah = to_hr_ah(ibah);
+	int ret = 0;
 
 	ah->av.port = rdma_ah_get_port_num(ah_attr);
 	ah->av.gid_index = grh->sgid_index;
-	ah->av.vlan = vlan_tag;
-	ah->av.vlan_en = vlan_en;
-	dev_dbg(dev, "gid_index = 0x%x,vlan = 0x%x\n", ah->av.gid_index,
-		ah->av.vlan);
 
 	if (rdma_ah_get_static_rate(ah_attr))
 		ah->av.stat_rate = IB_RATE_10_GBPS;
 
-	memcpy(ah->av.dgid, grh->dgid.raw, HNS_ROCE_GID_SIZE);
+	ah->av.hop_limit = grh->hop_limit;
+	ah->av.flowlabel = grh->flow_label;
+	ah->av.udp_sport = get_ah_udp_sport(ah_attr);
 	ah->av.sl = rdma_ah_get_sl(ah_attr);
+	ah->av.tclass = get_tclass(grh);
 
-	return 0;
+	memcpy(ah->av.dgid, grh->dgid.raw, HNS_ROCE_GID_SIZE);
+	memcpy(ah->av.mac, ah_attr->roce.dmac, ETH_ALEN);
+
+	/* HIP08 needs to record vlan info in Address Vector */
+	if (hr_dev->pci_dev->revision <= PCI_REVISION_ID_HIP08) {
+		ret = rdma_read_gid_l2_fields(ah_attr->grh.sgid_attr,
+					      &ah->av.vlan_id, NULL);
+		if (ret)
+			return ret;
+
+		ah->av.vlan_en = ah->av.vlan_id < VLAN_N_VID;
+	}
+
+	return ret;
 }
 
 int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
@@ -97,8 +104,3 @@
 
 	return 0;
 }
-
-void hns_roce_destroy_ah(struct ib_ah *ah, u32 flags)
-{
-	return;
-}
diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c b/drivers/infiniband/hw/hns/hns_roce_alloc.c
index 8c063c5..5b2baf8 100644
--- a/drivers/infiniband/hw/hns/hns_roce_alloc.c
+++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c
@@ -55,7 +55,7 @@
 			bitmap->last = 0;
 		*obj |= bitmap->top;
 	} else {
-		ret = -1;
+		ret = -EINVAL;
 	}
 
 	spin_unlock(&bitmap->lock);
@@ -100,7 +100,7 @@
 		}
 		*obj |= bitmap->top;
 	} else {
-		ret = -1;
+		ret = -EINVAL;
 	}
 
 	spin_unlock(&bitmap->lock);
@@ -157,84 +157,78 @@
 	kfree(bitmap->table);
 }
 
-void hns_roce_buf_free(struct hns_roce_dev *hr_dev, u32 size,
-		       struct hns_roce_buf *buf)
+void hns_roce_buf_free(struct hns_roce_dev *hr_dev, struct hns_roce_buf *buf)
 {
-	int i;
 	struct device *dev = hr_dev->dev;
+	u32 size = buf->size;
+	int i;
 
-	if (buf->nbufs == 1) {
+	if (size == 0)
+		return;
+
+	buf->size = 0;
+
+	if (hns_roce_buf_is_direct(buf)) {
 		dma_free_coherent(dev, size, buf->direct.buf, buf->direct.map);
 	} else {
-		for (i = 0; i < buf->nbufs; ++i)
+		for (i = 0; i < buf->npages; ++i)
 			if (buf->page_list[i].buf)
 				dma_free_coherent(dev, 1 << buf->page_shift,
 						  buf->page_list[i].buf,
 						  buf->page_list[i].map);
 		kfree(buf->page_list);
+		buf->page_list = NULL;
 	}
 }
 
 int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct,
 		       struct hns_roce_buf *buf, u32 page_shift)
 {
-	int i = 0;
-	dma_addr_t t;
+	struct hns_roce_buf_list *buf_list;
 	struct device *dev = hr_dev->dev;
-	u32 page_size = 1 << page_shift;
-	u32 order;
+	u32 page_size;
+	int i;
 
-	/* SQ/RQ buf lease than one page, SQ + RQ = 8K */
+	/* The minimum shift of the page accessed by hw is HNS_HW_PAGE_SHIFT */
+	buf->page_shift = max_t(int, HNS_HW_PAGE_SHIFT, page_shift);
+
+	page_size = 1 << buf->page_shift;
+	buf->npages = DIV_ROUND_UP(size, page_size);
+
+	/* required size is not bigger than one trunk size */
 	if (size <= max_direct) {
-		buf->nbufs = 1;
-		/* Npages calculated by page_size */
-		order = get_order(size);
-		if (order <= page_shift - PAGE_SHIFT)
-			order = 0;
-		else
-			order -= page_shift - PAGE_SHIFT;
-		buf->npages = 1 << order;
-		buf->page_shift = page_shift;
-		/* MTT PA must be recorded in 4k alignment, t is 4k aligned */
-		buf->direct.buf = dma_alloc_coherent(dev, size, &t,
+		buf->page_list = NULL;
+		buf->direct.buf = dma_alloc_coherent(dev, size,
+						     &buf->direct.map,
 						     GFP_KERNEL);
 		if (!buf->direct.buf)
 			return -ENOMEM;
-
-		buf->direct.map = t;
-
-		while (t & ((1 << buf->page_shift) - 1)) {
-			--buf->page_shift;
-			buf->npages *= 2;
-		}
 	} else {
-		buf->nbufs = (size + page_size - 1) / page_size;
-		buf->npages = buf->nbufs;
-		buf->page_shift = page_shift;
-		buf->page_list = kcalloc(buf->nbufs, sizeof(*buf->page_list),
-					 GFP_KERNEL);
-
-		if (!buf->page_list)
+		buf_list = kcalloc(buf->npages, sizeof(*buf_list), GFP_KERNEL);
+		if (!buf_list)
 			return -ENOMEM;
 
-		for (i = 0; i < buf->nbufs; ++i) {
-			buf->page_list[i].buf = dma_alloc_coherent(dev,
-								   page_size,
-								   &t,
-								   GFP_KERNEL);
-
-			if (!buf->page_list[i].buf)
-				goto err_free;
-
-			buf->page_list[i].map = t;
+		for (i = 0; i < buf->npages; i++) {
+			buf_list[i].buf = dma_alloc_coherent(dev, page_size,
+							     &buf_list[i].map,
+							     GFP_KERNEL);
+			if (!buf_list[i].buf)
+				break;
 		}
+
+		if (i != buf->npages && i > 0) {
+			while (i-- > 0)
+				dma_free_coherent(dev, page_size,
+						  buf_list[i].buf,
+						  buf_list[i].map);
+			kfree(buf_list);
+			return -ENOMEM;
+		}
+		buf->page_list = buf_list;
 	}
+	buf->size = size;
 
 	return 0;
-
-err_free:
-	hns_roce_buf_free(hr_dev, size, buf);
-	return -ENOMEM;
 }
 
 int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs,
@@ -246,39 +240,35 @@
 	end = start + buf_cnt;
 	if (end > buf->npages) {
 		dev_err(hr_dev->dev,
-			"invalid kmem region,offset %d,buf_cnt %d,total %d!\n",
+			"failed to check kmem bufs, end %d + %d total %u!\n",
 			start, buf_cnt, buf->npages);
 		return -EINVAL;
 	}
 
 	total = 0;
 	for (i = start; i < end; i++)
-		if (buf->nbufs == 1)
-			bufs[total++] = buf->direct.map +
-					((dma_addr_t)i << buf->page_shift);
-		else
-			bufs[total++] = buf->page_list[i].map;
+		bufs[total++] = hns_roce_buf_page(buf, i);
 
 	return total;
 }
 
 int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs,
 			   int buf_cnt, int start, struct ib_umem *umem,
-			   int page_shift)
+			   unsigned int page_shift)
 {
 	struct ib_block_iter biter;
 	int total = 0;
 	int idx = 0;
 	u64 addr;
 
-	if (page_shift < PAGE_SHIFT) {
-		dev_err(hr_dev->dev, "invalid page shift %d!\n", page_shift);
+	if (page_shift < HNS_HW_PAGE_SHIFT) {
+		dev_err(hr_dev->dev, "failed to check umem page shift %u!\n",
+			page_shift);
 		return -EINVAL;
 	}
 
 	/* convert system page cnt to hw page cnt */
-	rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap,
-			    1 << page_shift) {
+	rdma_umem_for_each_dma_block(umem, &biter, 1 << page_shift) {
 		addr = rdma_block_iter_dma_address(&biter);
 		if (idx >= start) {
 			bufs[total++] = addr;
@@ -292,49 +282,6 @@
 	return total;
 }
 
-void hns_roce_init_buf_region(struct hns_roce_buf_region *region, int hopnum,
-			      int offset, int buf_cnt)
-{
-	if (hopnum == HNS_ROCE_HOP_NUM_0)
-		region->hopnum = 0;
-	else
-		region->hopnum = hopnum;
-
-	region->offset = offset;
-	region->count = buf_cnt;
-}
-
-void hns_roce_free_buf_list(dma_addr_t **bufs, int region_cnt)
-{
-	int i;
-
-	for (i = 0; i < region_cnt; i++) {
-		kfree(bufs[i]);
-		bufs[i] = NULL;
-	}
-}
-
-int hns_roce_alloc_buf_list(struct hns_roce_buf_region *regions,
-			    dma_addr_t **bufs, int region_cnt)
-{
-	struct hns_roce_buf_region *r;
-	int i;
-
-	for (i = 0; i < region_cnt; i++) {
-		r = &regions[i];
-		bufs[i] = kcalloc(r->count, sizeof(dma_addr_t), GFP_KERNEL);
-		if (!bufs[i])
-			goto err_alloc;
-	}
-
-	return 0;
-
-err_alloc:
-	hns_roce_free_buf_list(bufs, i);
-
-	return -ENOMEM;
-}
-
 void hns_roce_cleanup_bitmap(struct hns_roce_dev *hr_dev)
 {
 	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ)
diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h
index 2b6ac64..1915bac 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cmd.h
+++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h
@@ -115,12 +115,12 @@
 
 enum {
 	/* TPT commands */
-	HNS_ROCE_CMD_SW2HW_MPT		= 0xd,
-	HNS_ROCE_CMD_HW2SW_MPT		= 0xf,
+	HNS_ROCE_CMD_CREATE_MPT		= 0xd,
+	HNS_ROCE_CMD_DESTROY_MPT	= 0xf,
 
 	/* CQ commands */
-	HNS_ROCE_CMD_SW2HW_CQ		= 0x16,
-	HNS_ROCE_CMD_HW2SW_CQ		= 0x17,
+	HNS_ROCE_CMD_CREATE_CQC		= 0x16,
+	HNS_ROCE_CMD_DESTROY_CQC	= 0x17,
 
 	/* QP/EE commands */
 	HNS_ROCE_CMD_RST2INIT_QP	= 0x19,
@@ -129,14 +129,14 @@
 	HNS_ROCE_CMD_RTS2RTS_QP		= 0x1c,
 	HNS_ROCE_CMD_2ERR_QP		= 0x1e,
 	HNS_ROCE_CMD_RTS2SQD_QP		= 0x1f,
-	HNS_ROCE_CMD_SQD2SQD_QP		= 0x38,
 	HNS_ROCE_CMD_SQD2RTS_QP		= 0x20,
 	HNS_ROCE_CMD_2RST_QP		= 0x21,
 	HNS_ROCE_CMD_QUERY_QP		= 0x22,
-	HNS_ROCE_CMD_SW2HW_SRQ		= 0x70,
+	HNS_ROCE_CMD_SQD2SQD_QP		= 0x38,
+	HNS_ROCE_CMD_CREATE_SRQ		= 0x70,
 	HNS_ROCE_CMD_MODIFY_SRQC	= 0x72,
 	HNS_ROCE_CMD_QUERY_SRQC		= 0x73,
-	HNS_ROCE_CMD_HW2SW_SRQ		= 0x74,
+	HNS_ROCE_CMD_DESTROY_SRQ	= 0x74,
 };
 
 int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param,
diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h
index 8e95a1a..f5669ff 100644
--- a/drivers/infiniband/hw/hns/hns_roce_common.h
+++ b/drivers/infiniband/hw/hns/hns_roce_common.h
@@ -33,10 +33,6 @@
 #ifndef _HNS_ROCE_COMMON_H
 #define _HNS_ROCE_COMMON_H
 
-#ifndef assert
-#define assert(cond)
-#endif
-
 #define roce_write(dev, reg, val)	writel((val), (dev)->reg_base + (reg))
 #define roce_read(dev, reg)		readl((dev)->reg_base + (reg))
 #define roce_raw_write(value, addr) \
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c
index 22541d1..8a6bded 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -39,97 +39,40 @@
 #include <rdma/hns-abi.h>
 #include "hns_roce_common.h"
 
-static void hns_roce_ib_cq_comp(struct hns_roce_cq *hr_cq)
-{
-	struct ib_cq *ibcq = &hr_cq->ib_cq;
-
-	ibcq->comp_handler(ibcq, ibcq->cq_context);
-}
-
-static void hns_roce_ib_cq_event(struct hns_roce_cq *hr_cq,
-				 enum hns_roce_event event_type)
-{
-	struct hns_roce_dev *hr_dev;
-	struct ib_event event;
-	struct ib_cq *ibcq;
-
-	ibcq = &hr_cq->ib_cq;
-	hr_dev = to_hr_dev(ibcq->device);
-
-	if (event_type != HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID &&
-	    event_type != HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR &&
-	    event_type != HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW) {
-		dev_err(hr_dev->dev,
-			"hns_roce_ib: Unexpected event type 0x%x on CQ %06lx\n",
-			event_type, hr_cq->cqn);
-		return;
-	}
-
-	if (ibcq->event_handler) {
-		event.device = ibcq->device;
-		event.event = IB_EVENT_CQ_ERR;
-		event.element.cq = ibcq;
-		ibcq->event_handler(&event, ibcq->cq_context);
-	}
-}
-
-static int hns_roce_sw2hw_cq(struct hns_roce_dev *dev,
-			     struct hns_roce_cmd_mailbox *mailbox,
-			     unsigned long cq_num)
-{
-	return hns_roce_cmd_mbox(dev, mailbox->dma, 0, cq_num, 0,
-			    HNS_ROCE_CMD_SW2HW_CQ, HNS_ROCE_CMD_TIMEOUT_MSECS);
-}
-
-static int hns_roce_cq_alloc(struct hns_roce_dev *hr_dev, int nent,
-			     struct hns_roce_mtt *hr_mtt,
-			     struct hns_roce_cq *hr_cq, int vector)
+static int alloc_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
 {
 	struct hns_roce_cmd_mailbox *mailbox;
-	struct hns_roce_hem_table *mtt_table;
 	struct hns_roce_cq_table *cq_table;
-	struct device *dev = hr_dev->dev;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	u64 mtts[MTT_MIN_COUNT] = { 0 };
 	dma_addr_t dma_handle;
-	u64 *mtts;
 	int ret;
 
+	ret = hns_roce_mtr_find(hr_dev, &hr_cq->mtr, 0, mtts, ARRAY_SIZE(mtts),
+				&dma_handle);
+	if (!ret) {
+		ibdev_err(ibdev, "failed to find CQ mtr, ret = %d.\n", ret);
+		return -EINVAL;
+	}
+
 	cq_table = &hr_dev->cq_table;
-
-	/* Get the physical address of cq buf */
-	if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE))
-		mtt_table = &hr_dev->mr_table.mtt_cqe_table;
-	else
-		mtt_table = &hr_dev->mr_table.mtt_table;
-
-	mtts = hns_roce_table_find(hr_dev, mtt_table,
-				   hr_mtt->first_seg, &dma_handle);
-	if (!mtts) {
-		dev_err(dev, "CQ alloc.Failed to find cq buf addr.\n");
-		return -EINVAL;
-	}
-
-	if (vector >= hr_dev->caps.num_comp_vectors) {
-		dev_err(dev, "CQ alloc.Invalid vector.\n");
-		return -EINVAL;
-	}
-	hr_cq->vector = vector;
-
 	ret = hns_roce_bitmap_alloc(&cq_table->bitmap, &hr_cq->cqn);
-	if (ret == -1) {
-		dev_err(dev, "CQ alloc.Failed to alloc index.\n");
-		return -ENOMEM;
+	if (ret) {
+		ibdev_err(ibdev, "failed to alloc CQ bitmap, ret = %d.\n", ret);
+		return ret;
 	}
 
 	/* Get CQC memory HEM(Hardware Entry Memory) table */
 	ret = hns_roce_table_get(hr_dev, &cq_table->table, hr_cq->cqn);
 	if (ret) {
-		dev_err(dev, "CQ alloc.Failed to get context mem.\n");
+		ibdev_err(ibdev, "failed to get CQ(0x%lx) context, ret = %d.\n",
+			  hr_cq->cqn, ret);
 		goto err_out;
 	}
 
 	ret = xa_err(xa_store(&cq_table->array, hr_cq->cqn, hr_cq, GFP_KERNEL));
 	if (ret) {
-		dev_err(dev, "CQ alloc failed xa_store.\n");
+		ibdev_err(ibdev, "failed to xa_store CQ, ret = %d.\n", ret);
 		goto err_put;
 	}
 
@@ -140,14 +83,16 @@
 		goto err_xa;
 	}
 
-	hr_dev->hw->write_cqc(hr_dev, hr_cq, mailbox->buf, mtts, dma_handle,
-			      nent, vector);
+	hr_dev->hw->write_cqc(hr_dev, hr_cq, mailbox->buf, mtts, dma_handle);
 
 	/* Send mailbox to hw */
-	ret = hns_roce_sw2hw_cq(hr_dev, mailbox, hr_cq->cqn);
+	ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_cq->cqn, 0,
+			HNS_ROCE_CMD_CREATE_CQC, HNS_ROCE_CMD_TIMEOUT_MSECS);
 	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
 	if (ret) {
-		dev_err(dev, "CQ alloc.Failed to cmd mailbox.\n");
+		ibdev_err(ibdev,
+			  "failed to send create cmd for CQ(0x%lx), ret = %d.\n",
+			  hr_cq->cqn, ret);
 		goto err_xa;
 	}
 
@@ -170,24 +115,17 @@
 	return ret;
 }
 
-static int hns_roce_hw2sw_cq(struct hns_roce_dev *dev,
-			     struct hns_roce_cmd_mailbox *mailbox,
-			     unsigned long cq_num)
-{
-	return hns_roce_cmd_mbox(dev, 0, mailbox ? mailbox->dma : 0, cq_num,
-				 mailbox ? 0 : 1, HNS_ROCE_CMD_HW2SW_CQ,
-				 HNS_ROCE_CMD_TIMEOUT_MSECS);
-}
-
-void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
+static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
 {
 	struct hns_roce_cq_table *cq_table = &hr_dev->cq_table;
 	struct device *dev = hr_dev->dev;
 	int ret;
 
-	ret = hns_roce_hw2sw_cq(hr_dev, NULL, hr_cq->cqn);
+	ret = hns_roce_cmd_mbox(hr_dev, 0, 0, hr_cq->cqn, 1,
+				HNS_ROCE_CMD_DESTROY_CQC,
+				HNS_ROCE_CMD_TIMEOUT_MSECS);
 	if (ret)
-		dev_err(dev, "HW2SW_CQ failed (%d) for CQN %06lx\n", ret,
+		dev_err(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", ret,
 			hr_cq->cqn);
 
 	xa_erase(&cq_table->array, hr_cq->cqn);
@@ -204,272 +142,180 @@
 	hns_roce_bitmap_free(&cq_table->bitmap, hr_cq->cqn, BITMAP_NO_RR);
 }
 
-static int hns_roce_ib_get_cq_umem(struct hns_roce_dev *hr_dev,
-				   struct ib_udata *udata,
-				   struct hns_roce_cq_buf *buf,
-				   struct ib_umem **umem, u64 buf_addr, int cqe)
+static int alloc_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
+			struct ib_udata *udata, unsigned long addr)
 {
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_buf_attr buf_attr = {};
 	int ret;
-	u32 page_shift;
-	u32 npages;
 
-	*umem = ib_umem_get(udata, buf_addr, cqe * hr_dev->caps.cq_entry_sz,
-			    IB_ACCESS_LOCAL_WRITE, 1);
-	if (IS_ERR(*umem))
-		return PTR_ERR(*umem);
+	buf_attr.page_shift = hr_dev->caps.cqe_buf_pg_sz + HNS_HW_PAGE_SHIFT;
+	buf_attr.region[0].size = hr_cq->cq_depth * hr_cq->cqe_size;
+	buf_attr.region[0].hopnum = hr_dev->caps.cqe_hop_num;
+	buf_attr.region_count = 1;
+	buf_attr.fixed_page = true;
 
-	if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE))
-		buf->hr_mtt.mtt_type = MTT_TYPE_CQE;
-	else
-		buf->hr_mtt.mtt_type = MTT_TYPE_WQE;
-
-	if (hr_dev->caps.cqe_buf_pg_sz) {
-		npages = (ib_umem_page_count(*umem) +
-			(1 << hr_dev->caps.cqe_buf_pg_sz) - 1) /
-			(1 << hr_dev->caps.cqe_buf_pg_sz);
-		page_shift = PAGE_SHIFT + hr_dev->caps.cqe_buf_pg_sz;
-		ret = hns_roce_mtt_init(hr_dev, npages, page_shift,
-					&buf->hr_mtt);
-	} else {
-		ret = hns_roce_mtt_init(hr_dev, ib_umem_page_count(*umem),
-					PAGE_SHIFT, &buf->hr_mtt);
-	}
+	ret = hns_roce_mtr_create(hr_dev, &hr_cq->mtr, &buf_attr,
+				  hr_dev->caps.cqe_ba_pg_sz + HNS_HW_PAGE_SHIFT,
+				  udata, addr);
 	if (ret)
-		goto err_buf;
+		ibdev_err(ibdev, "failed to alloc CQ mtr, ret = %d.\n", ret);
 
-	ret = hns_roce_ib_umem_write_mtt(hr_dev, &buf->hr_mtt, *umem);
-	if (ret)
-		goto err_mtt;
-
-	return 0;
-
-err_mtt:
-	hns_roce_mtt_cleanup(hr_dev, &buf->hr_mtt);
-
-err_buf:
-	ib_umem_release(*umem);
 	return ret;
 }
 
-static int hns_roce_ib_alloc_cq_buf(struct hns_roce_dev *hr_dev,
-				    struct hns_roce_cq_buf *buf, u32 nent)
+static void free_cq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
 {
-	int ret;
-	u32 page_shift = PAGE_SHIFT + hr_dev->caps.cqe_buf_pg_sz;
-
-	ret = hns_roce_buf_alloc(hr_dev, nent * hr_dev->caps.cq_entry_sz,
-				 (1 << page_shift) * 2, &buf->hr_buf,
-				 page_shift);
-	if (ret)
-		goto out;
-
-	if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE))
-		buf->hr_mtt.mtt_type = MTT_TYPE_CQE;
-	else
-		buf->hr_mtt.mtt_type = MTT_TYPE_WQE;
-
-	ret = hns_roce_mtt_init(hr_dev, buf->hr_buf.npages,
-				buf->hr_buf.page_shift, &buf->hr_mtt);
-	if (ret)
-		goto err_buf;
-
-	ret = hns_roce_buf_write_mtt(hr_dev, &buf->hr_mtt, &buf->hr_buf);
-	if (ret)
-		goto err_mtt;
-
-	return 0;
-
-err_mtt:
-	hns_roce_mtt_cleanup(hr_dev, &buf->hr_mtt);
-
-err_buf:
-	hns_roce_buf_free(hr_dev, nent * hr_dev->caps.cq_entry_sz,
-			  &buf->hr_buf);
-out:
-	return ret;
+	hns_roce_mtr_destroy(hr_dev, &hr_cq->mtr);
 }
 
-static void hns_roce_ib_free_cq_buf(struct hns_roce_dev *hr_dev,
-				    struct hns_roce_cq_buf *buf, int cqe)
+static int alloc_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
+		       struct ib_udata *udata, unsigned long addr,
+		       struct hns_roce_ib_create_cq_resp *resp)
 {
-	hns_roce_buf_free(hr_dev, (cqe + 1) * hr_dev->caps.cq_entry_sz,
-			  &buf->hr_buf);
-}
+	bool has_db = hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB;
+	struct hns_roce_ucontext *uctx;
+	int err;
 
-static int create_user_cq(struct hns_roce_dev *hr_dev,
-			  struct hns_roce_cq *hr_cq,
-			  struct ib_udata *udata,
-			  struct hns_roce_ib_create_cq_resp *resp,
-			  int cq_entries)
-{
-	struct hns_roce_ib_create_cq ucmd;
-	struct device *dev = hr_dev->dev;
-	int ret;
-	struct hns_roce_ucontext *context = rdma_udata_to_drv_context(
-				   udata, struct hns_roce_ucontext, ibucontext);
-
-	if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
-		dev_err(dev, "Failed to copy_from_udata.\n");
-		return -EFAULT;
-	}
-
-	/* Get user space address, write it into mtt table */
-	ret = hns_roce_ib_get_cq_umem(hr_dev, udata, &hr_cq->hr_buf,
-				      &hr_cq->umem, ucmd.buf_addr,
-				      cq_entries);
-	if (ret) {
-		dev_err(dev, "Failed to get_cq_umem.\n");
-		return ret;
-	}
-
-	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
-	    (udata->outlen >= sizeof(*resp))) {
-		ret = hns_roce_db_map_user(context, udata, ucmd.db_addr,
-					   &hr_cq->db);
-		if (ret) {
-			dev_err(dev, "cq record doorbell map failed!\n");
-			goto err_mtt;
+	if (udata) {
+		if (has_db &&
+		    udata->outlen >= offsetofend(typeof(*resp), cap_flags)) {
+			uctx = rdma_udata_to_drv_context(udata,
+					struct hns_roce_ucontext, ibucontext);
+			err = hns_roce_db_map_user(uctx, udata, addr,
+						   &hr_cq->db);
+			if (err)
+				return err;
+			hr_cq->flags |= HNS_ROCE_CQ_FLAG_RECORD_DB;
+			resp->cap_flags |= HNS_ROCE_CQ_FLAG_RECORD_DB;
 		}
-		hr_cq->db_en = 1;
-		resp->cap_flags |= HNS_ROCE_SUPPORT_CQ_RECORD_DB;
+	} else {
+		if (has_db) {
+			err = hns_roce_alloc_db(hr_dev, &hr_cq->db, 1);
+			if (err)
+				return err;
+			hr_cq->set_ci_db = hr_cq->db.db_record;
+			*hr_cq->set_ci_db = 0;
+			hr_cq->flags |= HNS_ROCE_CQ_FLAG_RECORD_DB;
+		}
+		hr_cq->cq_db_l = hr_dev->reg_base + hr_dev->odb_offset +
+				 DB_REG_OFFSET * hr_dev->priv_uar.index;
 	}
 
 	return 0;
-
-err_mtt:
-	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
-	ib_umem_release(hr_cq->umem);
-
-	return ret;
 }
 
-static int create_kernel_cq(struct hns_roce_dev *hr_dev,
-			    struct hns_roce_cq *hr_cq, int cq_entries)
+static void free_cq_db(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
+		       struct ib_udata *udata)
 {
-	struct device *dev = hr_dev->dev;
-	struct hns_roce_uar *uar;
-	int ret;
+	struct hns_roce_ucontext *uctx;
 
-	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) {
-		ret = hns_roce_alloc_db(hr_dev, &hr_cq->db, 1);
-		if (ret)
-			return ret;
+	if (!(hr_cq->flags & HNS_ROCE_CQ_FLAG_RECORD_DB))
+		return;
 
-		hr_cq->set_ci_db = hr_cq->db.db_record;
-		*hr_cq->set_ci_db = 0;
-		hr_cq->db_en = 1;
-	}
-
-	/* Init mtt table and write buff address to mtt table */
-	ret = hns_roce_ib_alloc_cq_buf(hr_dev, &hr_cq->hr_buf, cq_entries);
-	if (ret) {
-		dev_err(dev, "Failed to alloc_cq_buf.\n");
-		goto err_db;
-	}
-
-	uar = &hr_dev->priv_uar;
-	hr_cq->cq_db_l = hr_dev->reg_base + hr_dev->odb_offset +
-			 DB_REG_OFFSET * uar->index;
-
-	return 0;
-
-err_db:
-	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)
+	hr_cq->flags &= ~HNS_ROCE_CQ_FLAG_RECORD_DB;
+	if (udata) {
+		uctx = rdma_udata_to_drv_context(udata,
+						 struct hns_roce_ucontext,
+						 ibucontext);
+		hns_roce_db_unmap_user(uctx, &hr_cq->db);
+	} else {
 		hns_roce_free_db(hr_dev, &hr_cq->db);
-
-	return ret;
+	}
 }
 
-static void destroy_user_cq(struct hns_roce_dev *hr_dev,
-			    struct hns_roce_cq *hr_cq,
-			    struct ib_udata *udata,
-			    struct hns_roce_ib_create_cq_resp *resp)
+static void set_cqe_size(struct hns_roce_cq *hr_cq, struct ib_udata *udata,
+			 struct hns_roce_ib_create_cq *ucmd)
 {
-	struct hns_roce_ucontext *context = rdma_udata_to_drv_context(
-				   udata, struct hns_roce_ucontext, ibucontext);
+	struct hns_roce_dev *hr_dev = to_hr_dev(hr_cq->ib_cq.device);
 
-	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
-	    (udata->outlen >= sizeof(*resp)))
-		hns_roce_db_unmap_user(context, &hr_cq->db);
-
-	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
-	ib_umem_release(hr_cq->umem);
+	if (udata) {
+		if (udata->inlen >= offsetofend(typeof(*ucmd), cqe_size))
+			hr_cq->cqe_size = ucmd->cqe_size;
+		else
+			hr_cq->cqe_size = HNS_ROCE_V2_CQE_SIZE;
+	} else {
+		hr_cq->cqe_size = hr_dev->caps.cqe_sz;
+	}
 }
 
-static void destroy_kernel_cq(struct hns_roce_dev *hr_dev,
-			      struct hns_roce_cq *hr_cq)
-{
-	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
-	hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, hr_cq->ib_cq.cqe);
-
-	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)
-		hns_roce_free_db(hr_dev, &hr_cq->db);
-}
-
-int hns_roce_ib_create_cq(struct ib_cq *ib_cq,
-			  const struct ib_cq_init_attr *attr,
-			  struct ib_udata *udata)
+int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
+		       struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
-	struct device *dev = hr_dev->dev;
 	struct hns_roce_ib_create_cq_resp resp = {};
 	struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_ib_create_cq ucmd = {};
 	int vector = attr->comp_vector;
-	int cq_entries = attr->cqe;
+	u32 cq_entries = attr->cqe;
 	int ret;
 
 	if (cq_entries < 1 || cq_entries > hr_dev->caps.max_cqes) {
-		dev_err(dev, "Creat CQ failed. entries=%d, max=%d\n",
-			cq_entries, hr_dev->caps.max_cqes);
+		ibdev_err(ibdev, "failed to check CQ count %u, max = %u.\n",
+			  cq_entries, hr_dev->caps.max_cqes);
 		return -EINVAL;
 	}
 
-	if (hr_dev->caps.min_cqes)
-		cq_entries = max(cq_entries, hr_dev->caps.min_cqes);
+	if (vector >= hr_dev->caps.num_comp_vectors) {
+		ibdev_err(ibdev, "failed to check CQ vector = %d, max = %d.\n",
+			  vector, hr_dev->caps.num_comp_vectors);
+		return -EINVAL;
+	}
 
-	cq_entries = roundup_pow_of_two((unsigned int)cq_entries);
-	hr_cq->ib_cq.cqe = cq_entries - 1;
+	cq_entries = max(cq_entries, hr_dev->caps.min_cqes);
+	cq_entries = roundup_pow_of_two(cq_entries);
+	hr_cq->ib_cq.cqe = cq_entries - 1; /* used as cqe index */
+	hr_cq->cq_depth = cq_entries;
+	hr_cq->vector = vector;
 	spin_lock_init(&hr_cq->lock);
+	INIT_LIST_HEAD(&hr_cq->sq_list);
+	INIT_LIST_HEAD(&hr_cq->rq_list);
 
 	if (udata) {
-		ret = create_user_cq(hr_dev, hr_cq, udata, &resp, cq_entries);
+		ret = ib_copy_from_udata(&ucmd, udata,
+					 min(udata->inlen, sizeof(ucmd)));
 		if (ret) {
-			dev_err(dev, "Create cq failed in user mode!\n");
-			goto err_cq;
-		}
-	} else {
-		ret = create_kernel_cq(hr_dev, hr_cq, cq_entries);
-		if (ret) {
-			dev_err(dev, "Create cq failed in kernel mode!\n");
-			goto err_cq;
+			ibdev_err(ibdev, "failed to copy CQ udata, ret = %d.\n",
+				  ret);
+			return ret;
 		}
 	}
 
-	/* Allocate cq index, fill cq_context */
-	ret = hns_roce_cq_alloc(hr_dev, cq_entries, &hr_cq->hr_buf.hr_mtt,
-				hr_cq, vector);
+	set_cqe_size(hr_cq, udata, &ucmd);
+
+	ret = alloc_cq_buf(hr_dev, hr_cq, udata, ucmd.buf_addr);
 	if (ret) {
-		dev_err(dev, "Creat CQ .Failed to cq_alloc.\n");
-		goto err_dbmap;
+		ibdev_err(ibdev, "failed to alloc CQ buf, ret = %d.\n", ret);
+		return ret;
+	}
+
+	ret = alloc_cq_db(hr_dev, hr_cq, udata, ucmd.db_addr, &resp);
+	if (ret) {
+		ibdev_err(ibdev, "failed to alloc CQ db, ret = %d.\n", ret);
+		goto err_cq_buf;
+	}
+
+	ret = alloc_cqc(hr_dev, hr_cq);
+	if (ret) {
+		ibdev_err(ibdev,
+			  "failed to alloc CQ context, ret = %d.\n", ret);
+		goto err_cq_db;
 	}
 
 	/*
 	 * For the QP created by kernel space, tptr value should be initialized
 	 * to zero; For the QP created by user space, it will cause synchronous
-	 * problems if tptr is set to zero here, so we initialze it in user
+	 * problems if tptr is set to zero here, so we initialize it in user
 	 * space.
 	 */
 	if (!udata && hr_cq->tptr_addr)
 		*hr_cq->tptr_addr = 0;
 
-	/* Get created cq handler and carry out event */
-	hr_cq->comp = hns_roce_ib_cq_comp;
-	hr_cq->event = hns_roce_ib_cq_event;
-	hr_cq->cq_depth = cq_entries;
-
 	if (udata) {
 		resp.cqn = hr_cq->cqn;
-		ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
+		ret = ib_copy_to_udata(udata, &resp,
+				       min(udata->outlen, sizeof(resp)));
 		if (ret)
 			goto err_cqc;
 	}
@@ -477,81 +323,81 @@
 	return 0;
 
 err_cqc:
-	hns_roce_free_cq(hr_dev, hr_cq);
-
-err_dbmap:
-	if (udata)
-		destroy_user_cq(hr_dev, hr_cq, udata, &resp);
-	else
-		destroy_kernel_cq(hr_dev, hr_cq);
-
-err_cq:
+	free_cqc(hr_dev, hr_cq);
+err_cq_db:
+	free_cq_db(hr_dev, hr_cq, udata);
+err_cq_buf:
+	free_cq_buf(hr_dev, hr_cq);
 	return ret;
 }
 
-void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+int hns_roce_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
 	struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
 
-	if (hr_dev->hw->destroy_cq) {
+	if (hr_dev->hw->destroy_cq)
 		hr_dev->hw->destroy_cq(ib_cq, udata);
-		return;
-	}
 
-	hns_roce_free_cq(hr_dev, hr_cq);
-	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
-
-	ib_umem_release(hr_cq->umem);
-	if (udata) {
-		if (hr_cq->db_en == 1)
-			hns_roce_db_unmap_user(rdma_udata_to_drv_context(
-						       udata,
-						       struct hns_roce_ucontext,
-						       ibucontext),
-					       &hr_cq->db);
-	} else {
-		/* Free the buff of stored cq */
-		hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf, ib_cq->cqe);
-		if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB)
-			hns_roce_free_db(hr_dev, &hr_cq->db);
-	}
+	free_cq_buf(hr_dev, hr_cq);
+	free_cq_db(hr_dev, hr_cq, udata);
+	free_cqc(hr_dev, hr_cq);
+	return 0;
 }
 
 void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn)
 {
-	struct device *dev = hr_dev->dev;
-	struct hns_roce_cq *cq;
+	struct hns_roce_cq *hr_cq;
+	struct ib_cq *ibcq;
 
-	cq = xa_load(&hr_dev->cq_table.array, cqn & (hr_dev->caps.num_cqs - 1));
-	if (!cq) {
-		dev_warn(dev, "Completion event for bogus CQ 0x%08x\n", cqn);
+	hr_cq = xa_load(&hr_dev->cq_table.array,
+			cqn & (hr_dev->caps.num_cqs - 1));
+	if (!hr_cq) {
+		dev_warn(hr_dev->dev, "Completion event for bogus CQ 0x%06x\n",
+			 cqn);
 		return;
 	}
 
-	++cq->arm_sn;
-	cq->comp(cq);
+	++hr_cq->arm_sn;
+	ibcq = &hr_cq->ib_cq;
+	if (ibcq->comp_handler)
+		ibcq->comp_handler(ibcq, ibcq->cq_context);
 }
 
 void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type)
 {
-	struct hns_roce_cq_table *cq_table = &hr_dev->cq_table;
 	struct device *dev = hr_dev->dev;
-	struct hns_roce_cq *cq;
+	struct hns_roce_cq *hr_cq;
+	struct ib_event event;
+	struct ib_cq *ibcq;
 
-	cq = xa_load(&cq_table->array, cqn & (hr_dev->caps.num_cqs - 1));
-	if (cq)
-		atomic_inc(&cq->refcount);
-
-	if (!cq) {
-		dev_warn(dev, "Async event for bogus CQ %08x\n", cqn);
+	hr_cq = xa_load(&hr_dev->cq_table.array,
+			cqn & (hr_dev->caps.num_cqs - 1));
+	if (!hr_cq) {
+		dev_warn(dev, "Async event for bogus CQ 0x%06x\n", cqn);
 		return;
 	}
 
-	cq->event(cq, (enum hns_roce_event)event_type);
+	if (event_type != HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID &&
+	    event_type != HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR &&
+	    event_type != HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW) {
+		dev_err(dev, "Unexpected event type 0x%x on CQ 0x%06x\n",
+			event_type, cqn);
+		return;
+	}
 
-	if (atomic_dec_and_test(&cq->refcount))
-		complete(&cq->free);
+	atomic_inc(&hr_cq->refcount);
+
+	ibcq = &hr_cq->ib_cq;
+	if (ibcq->event_handler) {
+		event.device = ibcq->device;
+		event.element.cq = ibcq;
+		event.event = IB_EVENT_CQ_ERR;
+		ibcq->event_handler(&event, ibcq->cq_context);
+	}
+
+	if (atomic_dec_and_test(&hr_cq->refcount))
+		complete(&hr_cq->free);
 }
 
 int hns_roce_init_cq_table(struct hns_roce_dev *hr_dev)
diff --git a/drivers/infiniband/hw/hns/hns_roce_db.c b/drivers/infiniband/hw/hns/hns_roce_db.c
index c00714c..bff6abd 100644
--- a/drivers/infiniband/hw/hns/hns_roce_db.c
+++ b/drivers/infiniband/hw/hns/hns_roce_db.c
@@ -31,7 +31,8 @@
 
 	refcount_set(&page->refcount, 1);
 	page->user_virt = page_addr;
-	page->umem = ib_umem_get(udata, page_addr, PAGE_SIZE, 0, 0);
+	page->umem = ib_umem_get(context->ibucontext.device, page_addr,
+				 PAGE_SIZE, 0);
 	if (IS_ERR(page->umem)) {
 		ret = PTR_ERR(page->umem);
 		kfree(page);
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h
index 3e68ba9..d9aa742 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -37,24 +37,19 @@
 
 #define DRV_NAME "hns_roce"
 
-/* hip08 is a pci device, it includes two version according pci version id */
-#define PCI_REVISION_ID_HIP08_A			0x20
-#define PCI_REVISION_ID_HIP08_B			0x21
+#define PCI_REVISION_ID_HIP08			0x21
+#define PCI_REVISION_ID_HIP09			0x30
 
 #define HNS_ROCE_HW_VER1	('h' << 24 | 'i' << 16 | '0' << 8 | '6')
 
 #define HNS_ROCE_MAX_MSG_LEN			0x80000000
 
-#define HNS_ROCE_ALOGN_UP(a, b) ((((a) + (b) - 1) / (b)) * (b))
-
 #define HNS_ROCE_IB_MIN_SQ_STRIDE		6
 
 #define HNS_ROCE_BA_SIZE			(32 * 4096)
 
 #define BA_BYTE_LEN				8
 
-#define BITS_PER_BYTE				8
-
 /* Hardware specification only for v1 engine */
 #define HNS_ROCE_MIN_CQE_NUM			0x40
 #define HNS_ROCE_MIN_WQE_NUM			0x20
@@ -62,7 +57,6 @@
 /* Hardware specification only for v1 engine */
 #define HNS_ROCE_MAX_INNER_MTPT_NUM		0x7
 #define HNS_ROCE_MAX_MTPT_PBL_NUM		0x100000
-#define HNS_ROCE_MAX_SGE_NUM			2
 
 #define HNS_ROCE_EACH_FREE_CQ_WAIT_MSECS	20
 #define HNS_ROCE_MAX_FREE_CQ_WAIT_CNT	\
@@ -81,15 +75,18 @@
 #define HNS_ROCE_CEQ				0
 #define HNS_ROCE_AEQ				1
 
-#define HNS_ROCE_CEQ_ENTRY_SIZE			0x4
-#define HNS_ROCE_AEQ_ENTRY_SIZE			0x10
+#define HNS_ROCE_CEQE_SIZE 0x4
+#define HNS_ROCE_AEQE_SIZE 0x10
 
-#define HNS_ROCE_SL_SHIFT			28
-#define HNS_ROCE_TCLASS_SHIFT			20
-#define HNS_ROCE_FLOW_LABEL_MASK		0xfffff
+#define HNS_ROCE_V3_EQE_SIZE 0x40
+
+#define HNS_ROCE_V2_CQE_SIZE 32
+#define HNS_ROCE_V3_CQE_SIZE 64
+
+#define HNS_ROCE_V2_QPC_SZ 256
+#define HNS_ROCE_V3_QPC_SZ 512
 
 #define HNS_ROCE_MAX_PORTS			6
-#define HNS_ROCE_MAX_GID_NUM			16
 #define HNS_ROCE_GID_SIZE			16
 #define HNS_ROCE_SGE_SIZE			16
 
@@ -109,11 +106,6 @@
 #define NODE_DESC_SIZE				64
 #define DB_REG_OFFSET				0x1000
 
-#define SERV_TYPE_RC				0
-#define SERV_TYPE_RD				1
-#define SERV_TYPE_UC				2
-#define SERV_TYPE_UD				3
-
 /* Configure to HW for PAGE_SIZE larger than 4KB */
 #define PG_SHIFT_OFFSET				(PAGE_SHIFT - 12)
 
@@ -122,8 +114,6 @@
 #define PAGES_SHIFT_24				24
 #define PAGES_SHIFT_32				32
 
-#define HNS_ROCE_PCI_BAR_NUM			2
-
 #define HNS_ROCE_IDX_QUE_ENTRY_SZ		4
 #define SRQ_DB_REG				0x230
 
@@ -133,12 +123,19 @@
 #define EQ_DEPTH_COEFF				2
 
 enum {
-	HNS_ROCE_SUPPORT_RQ_RECORD_DB = 1 << 0,
-	HNS_ROCE_SUPPORT_SQ_RECORD_DB = 1 << 1,
+	SERV_TYPE_RC,
+	SERV_TYPE_UC,
+	SERV_TYPE_RD,
+	SERV_TYPE_UD,
 };
 
 enum {
-	HNS_ROCE_SUPPORT_CQ_RECORD_DB = 1 << 0,
+	HNS_ROCE_QP_CAP_RQ_RECORD_DB = BIT(0),
+	HNS_ROCE_QP_CAP_SQ_RECORD_DB = BIT(1),
+};
+
+enum hns_roce_cq_flags {
+	HNS_ROCE_CQ_FLAG_RECORD_DB = BIT(0),
 };
 
 enum hns_roce_qp_state {
@@ -211,6 +208,8 @@
 	HNS_ROCE_OPCODE_RDMA_WITH_IMM_RECEIVE	= 0x07,
 };
 
+#define HNS_ROCE_CAP_FLAGS_EX_SHIFT 12
+
 enum {
 	HNS_ROCE_CAP_FLAG_REREG_MR		= BIT(0),
 	HNS_ROCE_CAP_FLAG_ROCE_V1_V2		= BIT(1),
@@ -224,13 +223,6 @@
 	HNS_ROCE_CAP_FLAG_ATOMIC		= BIT(10),
 };
 
-enum hns_roce_mtt_type {
-	MTT_TYPE_WQE,
-	MTT_TYPE_CQE,
-	MTT_TYPE_SRQWQE,
-	MTT_TYPE_IDX
-};
-
 #define HNS_ROCE_DB_TYPE_COUNT			2
 #define HNS_ROCE_DB_UNIT_SIZE			4
 
@@ -269,9 +261,12 @@
 #define HNS_ROCE_PORT_DOWN			0
 #define HNS_ROCE_PORT_UP			1
 
-#define HNS_ROCE_MTT_ENTRY_PER_SEG		8
+/* The minimum page size is 4K for hardware */
+#define HNS_HW_PAGE_SHIFT			12
+#define HNS_HW_PAGE_SIZE			(1 << HNS_HW_PAGE_SHIFT)
 
-#define PAGE_ADDR_SHIFT				12
+/* The minimum page count for hardware access page directly. */
+#define HNS_HW_DIRECT_PAGE_COUNT 2
 
 struct hns_roce_uar {
 	u64		pfn;
@@ -302,22 +297,6 @@
 	unsigned long		*table;
 };
 
-/* Order bitmap length -- bit num compute formula: 1 << (max_order - order) */
-/* Order = 0: bitmap is biggest, order = max bitmap is least (only a bit) */
-/* Every bit repesent to a partner free/used status in bitmap */
-/*
- * Initial, bits of other bitmap are all 0 except that a bit of max_order is 1
- * Bit = 1 represent to idle and available; bit = 0: not available
- */
-struct hns_roce_buddy {
-	/* Members point to every order level bitmap */
-	unsigned long **bits;
-	/* Represent to avail bits of the order level bitmap */
-	u32            *num_free;
-	int             max_order;
-	spinlock_t      lock;
-};
-
 /* For Hardware Entry Memory */
 struct hns_roce_hem_table {
 	/* HEM type: 0 = qpc, 1 = mtt, 2 = cqc, 3 = srq, 4 = other */
@@ -338,13 +317,6 @@
 	dma_addr_t	*bt_l0_dma_addr;
 };
 
-struct hns_roce_mtt {
-	unsigned long		first_seg;
-	int			order;
-	int			page_shift;
-	enum hns_roce_mtt_type	mtt_type;
-};
-
 struct hns_roce_buf_region {
 	int offset; /* page offset */
 	u32 count; /* page count */
@@ -359,13 +331,36 @@
 	struct list_head mid_bt[HNS_ROCE_MAX_BT_REGION][HNS_ROCE_MAX_BT_LEVEL];
 	struct list_head btm_bt; /* link all bottom bt in @mid_bt */
 	dma_addr_t root_ba; /* pointer to the root ba table */
-	int bt_pg_shift;
+};
+
+struct hns_roce_buf_attr {
+	struct {
+		size_t	size;  /* region size */
+		int	hopnum; /* multi-hop addressing hop num */
+	} region[HNS_ROCE_MAX_BT_REGION];
+	int region_count; /* valid region count */
+	unsigned int page_shift;  /* buffer page shift */
+	bool fixed_page; /* decide page shift is fixed-size or maximum size */
+	int user_access; /* umem access flag */
+	bool mtt_only; /* only alloc buffer-required MTT memory */
+};
+
+struct hns_roce_hem_cfg {
+	dma_addr_t	root_ba; /* root BA table's address */
+	bool		is_direct; /* addressing without BA table */
+	unsigned int	ba_pg_shift; /* BA table page shift */
+	unsigned int	buf_pg_shift; /* buffer page shift */
+	unsigned int	buf_pg_count;  /* buffer page count */
+	struct hns_roce_buf_region region[HNS_ROCE_MAX_BT_REGION];
+	int		region_count;
 };
 
 /* memory translate region */
 struct hns_roce_mtr {
-	struct hns_roce_hem_list hem_list;
-	int buf_pg_shift;
+	struct hns_roce_hem_list hem_list; /* multi-hop addressing resource */
+	struct ib_umem		*umem; /* user space buffer */
+	struct hns_roce_buf	*kmem; /* kernel space buffer */
+	struct hns_roce_hem_cfg  hem_cfg; /* config for hardware addressing */
 };
 
 struct hns_roce_mw {
@@ -383,50 +378,28 @@
 
 struct hns_roce_mr {
 	struct ib_mr		ibmr;
-	struct ib_umem		*umem;
 	u64			iova; /* MR's virtual orignal addr */
 	u64			size; /* Address range of MR */
 	u32			key; /* Key of MR */
 	u32			pd;   /* PD num of MR */
 	u32			access;	/* Access permission of MR */
-	u32			npages;
 	int			enabled; /* MR's active status */
 	int			type;	/* MR's register type */
-	u64			*pbl_buf;	/* MR's PBL space */
-	dma_addr_t		pbl_dma_addr;	/* MR's PBL space PA */
-	u32			pbl_size;	/* PA number in the PBL */
-	u64			pbl_ba;		/* page table address */
-	u32			l0_chunk_last_num;	/* L0 last number */
-	u32			l1_chunk_last_num;	/* L1 last number */
-	u64			**pbl_bt_l2;	/* PBL BT L2 */
-	u64			**pbl_bt_l1;	/* PBL BT L1 */
-	u64			*pbl_bt_l0;	/* PBL BT L0 */
-	dma_addr_t		*pbl_l2_dma_addr;	/* PBL BT L2 dma addr */
-	dma_addr_t		*pbl_l1_dma_addr;	/* PBL BT L1 dma addr */
-	dma_addr_t		pbl_l0_dma_addr;	/* PBL BT L0 dma addr */
-	u32			pbl_ba_pg_sz;	/* BT chunk page size */
-	u32			pbl_buf_pg_sz;	/* buf chunk page size */
 	u32			pbl_hop_num;	/* multi-hop number */
+	struct hns_roce_mtr	pbl_mtr;
+	u32			npages;
+	dma_addr_t		*page_list;
 };
 
 struct hns_roce_mr_table {
 	struct hns_roce_bitmap		mtpt_bitmap;
-	struct hns_roce_buddy		mtt_buddy;
-	struct hns_roce_hem_table	mtt_table;
 	struct hns_roce_hem_table	mtpt_table;
-	struct hns_roce_buddy		mtt_cqe_buddy;
-	struct hns_roce_hem_table	mtt_cqe_table;
-	struct hns_roce_buddy		mtt_srqwqe_buddy;
-	struct hns_roce_hem_table	mtt_srqwqe_table;
-	struct hns_roce_buddy		mtt_idx_buddy;
-	struct hns_roce_hem_table	mtt_idx_table;
 };
 
 struct hns_roce_wq {
 	u64		*wrid;     /* Work request ID */
 	spinlock_t	lock;
 	u32		wqe_cnt;  /* WQE num */
-	u32		max_post;
 	int		max_gs;
 	int		offset;
 	int		wqe_shift;	/* WQE size */
@@ -436,7 +409,7 @@
 };
 
 struct hns_roce_sge {
-	int		sge_cnt;	/* SGE num */
+	unsigned int	sge_cnt;	/* SGE num */
 	int		offset;
 	int		sge_shift;	/* SGE size */
 };
@@ -449,9 +422,9 @@
 struct hns_roce_buf {
 	struct hns_roce_buf_list	direct;
 	struct hns_roce_buf_list	*page_list;
-	int				nbufs;
 	u32				npages;
-	int				page_shift;
+	u32				size;
+	unsigned int			page_shift;
 };
 
 struct hns_roce_db_pgdir {
@@ -482,48 +455,39 @@
 	int		order;
 };
 
-struct hns_roce_cq_buf {
-	struct hns_roce_buf hr_buf;
-	struct hns_roce_mtt hr_mtt;
-};
-
 struct hns_roce_cq {
 	struct ib_cq			ib_cq;
-	struct hns_roce_cq_buf		hr_buf;
+	struct hns_roce_mtr		mtr;
 	struct hns_roce_db		db;
-	u8				db_en;
+	u32				flags;
 	spinlock_t			lock;
-	struct ib_umem			*umem;
-	void (*comp)(struct hns_roce_cq *cq);
-	void (*event)(struct hns_roce_cq *cq, enum hns_roce_event event_type);
-
-	struct hns_roce_uar		*uar;
 	u32				cq_depth;
 	u32				cons_index;
 	u32				*set_ci_db;
 	void __iomem			*cq_db_l;
 	u16				*tptr_addr;
 	int				arm_sn;
+	int				cqe_size;
 	unsigned long			cqn;
 	u32				vector;
 	atomic_t			refcount;
 	struct completion		free;
+	struct list_head		sq_list; /* all qps on this send cq */
+	struct list_head		rq_list; /* all qps on this recv cq */
+	int				is_armed; /* cq is armed */
+	struct list_head		node; /* all armed cqs are on a list */
 };
 
 struct hns_roce_idx_que {
-	struct hns_roce_buf		idx_buf;
-	int				entry_sz;
-	u32				buf_size;
-	struct ib_umem			*umem;
-	struct hns_roce_mtt		mtt;
+	struct hns_roce_mtr		mtr;
+	int				entry_shift;
 	unsigned long			*bitmap;
 };
 
 struct hns_roce_srq {
 	struct ib_srq		ibsrq;
-	void (*event)(struct hns_roce_srq *srq, enum hns_roce_event event);
 	unsigned long		srqn;
-	int			max;
+	u32			wqe_cnt;
 	int			max_gs;
 	int			wqe_shift;
 	void __iomem		*db_reg_l;
@@ -531,16 +495,15 @@
 	atomic_t		refcount;
 	struct completion	free;
 
-	struct hns_roce_buf	buf;
+	struct hns_roce_mtr	buf_mtr;
+
 	u64		       *wrid;
-	struct ib_umem	       *umem;
-	struct hns_roce_mtt	mtt;
 	struct hns_roce_idx_que idx_que;
 	spinlock_t		lock;
 	int			head;
 	int			tail;
-	u16			wqe_ctr;
 	struct mutex		mutex;
+	void (*event)(struct hns_roce_srq *srq, enum hns_roce_event event);
 };
 
 struct hns_roce_uar_table {
@@ -573,17 +536,18 @@
 };
 
 struct hns_roce_av {
-	u8          port;
-	u8          gid_index;
-	u8          stat_rate;
-	u8          hop_limit;
-	u32         flowlabel;
-	u8          sl;
-	u8          tclass;
-	u8          dgid[HNS_ROCE_GID_SIZE];
-	u8          mac[ETH_ALEN];
-	u16         vlan;
-	bool	    vlan_en;
+	u8 port;
+	u8 gid_index;
+	u8 stat_rate;
+	u8 hop_limit;
+	u32 flowlabel;
+	u16 udp_sport;
+	u8 sl;
+	u8 tclass;
+	u8 dgid[HNS_ROCE_GID_SIZE];
+	u8 mac[ETH_ALEN];
+	u16 vlan_id;
+	u8 vlan_en;
 };
 
 struct hns_roce_ah {
@@ -648,28 +612,31 @@
 	u32			 wqe_cnt;
 };
 
+enum {
+	HNS_ROCE_FLUSH_FLAG = 0,
+};
+
+struct hns_roce_work {
+	struct hns_roce_dev *hr_dev;
+	struct work_struct work;
+	u32 qpn;
+	u32 cqn;
+	int event_type;
+	int sub_type;
+};
+
 struct hns_roce_qp {
 	struct ib_qp		ibqp;
-	struct hns_roce_buf	hr_buf;
 	struct hns_roce_wq	rq;
 	struct hns_roce_db	rdb;
 	struct hns_roce_db	sdb;
-	u8			rdb_en;
-	u8			sdb_en;
+	unsigned long		en_flags;
 	u32			doorbell_qpn;
 	enum ib_sig_type	sq_signal_bits;
 	struct hns_roce_wq	sq;
 
-	struct ib_umem		*umem;
-	struct hns_roce_mtt	mtt;
 	struct hns_roce_mtr	mtr;
 
-	/* this define must less than HNS_ROCE_MAX_BT_REGION */
-#define HNS_ROCE_WQE_REGION_MAX	 3
-	struct hns_roce_buf_region regions[HNS_ROCE_WQE_REGION_MAX];
-	int			region_cnt;
-	int                     wqe_bt_pg_shift;
-
 	u32			buff_size;
 	struct mutex		mutex;
 	u8			port;
@@ -690,12 +657,16 @@
 
 	struct hns_roce_sge	sge;
 	u32			next_sge;
+	enum ib_mtu		path_mtu;
+	u32			max_inline_data;
 
+	/* 0: flush needed, 1: unneeded */
+	unsigned long		flush_flag;
+	struct hns_roce_work	flush_work;
 	struct hns_roce_rinl_buf rq_inl_buf;
-};
-
-struct hns_roce_sqp {
-	struct hns_roce_qp	hr_qp;
+	struct list_head	node;		/* all qps are on a list */
+	struct list_head	rq_node;	/* all recv qps are on a list */
+	struct list_head	sq_node;	/* all send qps are on a list */
 };
 
 struct hns_roce_ib_iboe {
@@ -711,7 +682,8 @@
 };
 
 struct hns_roce_ceqe {
-	__le32			comp;
+	__le32	comp;
+	__le32	rsv[15];
 };
 
 struct hns_roce_aeqe {
@@ -748,6 +720,7 @@
 			u8	rsv0;
 		} __packed cmd;
 	 } event;
+	__le32 rsv[12];
 };
 
 struct hns_roce_eq {
@@ -766,23 +739,11 @@
 	int				over_ignore;
 	int				coalesce;
 	int				arm_st;
-	u64				eqe_ba;
-	int				eqe_ba_pg_sz;
-	int				eqe_buf_pg_sz;
 	int				hop_num;
-	u64				*bt_l0;	/* Base address table for L0 */
-	u64				**bt_l1; /* Base address table for L1 */
-	u64				**buf;
-	dma_addr_t			l0_dma;
-	dma_addr_t			*l1_dma;
-	dma_addr_t			*buf_dma;
-	u32				l0_last_num; /* L0 last chunk num */
-	u32				l1_last_num; /* L1 last chunk num */
-	int				eq_max_cnt;
+	struct hns_roce_mtr		mtr;
+	u16				eq_max_cnt;
 	int				eq_period;
 	int				shift;
-	dma_addr_t			cur_eqe_ba;
-	dma_addr_t			nxt_eqe_ba;
 	int				event_type;
 	int				sub_type;
 };
@@ -808,10 +769,8 @@
 	int             reserved_qps;
 	int		num_qpc_timer;
 	int		num_cqc_timer;
-	u32		max_srq_sg;
 	int		num_srqs;
 	u32		max_wqes;
-	u32		max_srqs;
 	u32		max_srq_wrs;
 	u32		max_srq_sges;
 	u32		max_sq_desc_sz;
@@ -820,12 +779,11 @@
 	int		max_qp_init_rdma;
 	int		max_qp_dest_rdma;
 	int		num_cqs;
-	int		max_cqes;
-	int		min_cqes;
+	u32		max_cqes;
+	u32		min_cqes;
 	u32		min_wqes;
 	int		reserved_cqs;
 	int		reserved_srqs;
-	u32		max_srqwqes;
 	int		num_aeq_vectors;
 	int		num_comp_vectors;
 	int		num_other_vectors;
@@ -839,15 +797,15 @@
 	int		num_pds;
 	int		reserved_pds;
 	u32		mtt_entry_sz;
-	u32		cq_entry_sz;
+	u32		cqe_sz;
 	u32		page_size_cap;
 	u32		reserved_lkey;
 	int		mtpt_entry_sz;
-	int		qpc_entry_sz;
+	int		qpc_sz;
 	int		irrl_entry_sz;
 	int		trrl_entry_sz;
 	int		cqc_entry_sz;
-	int		sccc_entry_sz;
+	int		sccc_sz;
 	int		qpc_timer_entry_sz;
 	int		cqc_timer_entry_sz;
 	int		srqc_entry_sz;
@@ -857,6 +815,8 @@
 	u32		pbl_hop_num;
 	int		aeqe_depth;
 	int		ceqe_depth;
+	u32		aeqe_size;
+	u32		ceqe_size;
 	enum ib_mtu	max_mtu;
 	u32		qpc_bt_num;
 	u32		qpc_timer_bt_num;
@@ -892,7 +852,7 @@
 	u32		cqc_timer_ba_pg_sz;
 	u32		cqc_timer_buf_pg_sz;
 	u32		cqc_timer_hop_num;
-	u32		cqe_ba_pg_sz;
+	u32             cqe_ba_pg_sz;	/* page_size = 4K*(2^cqe_ba_pg_sz) */
 	u32		cqe_buf_pg_sz;
 	u32		cqe_hop_num;
 	u32		srqwqe_ba_pg_sz;
@@ -909,15 +869,12 @@
 	u32		tpq_buf_pg_sz;
 	u32		chunk_sz;	/* chunk size in non multihop mode */
 	u64		flags;
-};
-
-struct hns_roce_work {
-	struct hns_roce_dev *hr_dev;
-	struct work_struct work;
-	u32 qpn;
-	u32 cqn;
-	int event_type;
-	int sub_type;
+	u16		default_ceq_max_cnt;
+	u16		default_ceq_period;
+	u16		default_aeq_max_cnt;
+	u16		default_aeq_period;
+	u16		default_aeq_arm_st;
+	u16		default_ceq_arm_st;
 };
 
 struct hns_roce_dfx_hw {
@@ -925,6 +882,12 @@
 			      int *buffer);
 };
 
+enum hns_roce_device_state {
+	HNS_ROCE_DEVICE_STATE_INITED,
+	HNS_ROCE_DEVICE_STATE_RST_DOWN,
+	HNS_ROCE_DEVICE_STATE_UNINIT,
+};
+
 struct hns_roce_hw {
 	int (*reset)(struct hns_roce_dev *hr_dev, bool enable);
 	int (*cmq_init)(struct hns_roce_dev *hr_dev);
@@ -942,17 +905,18 @@
 	int (*set_mac)(struct hns_roce_dev *hr_dev, u8 phy_port, u8 *addr);
 	void (*set_mtu)(struct hns_roce_dev *hr_dev, u8 phy_port,
 			enum ib_mtu mtu);
-	int (*write_mtpt)(void *mb_buf, struct hns_roce_mr *mr,
-			  unsigned long mtpt_idx);
+	int (*write_mtpt)(struct hns_roce_dev *hr_dev, void *mb_buf,
+			  struct hns_roce_mr *mr, unsigned long mtpt_idx);
 	int (*rereg_write_mtpt)(struct hns_roce_dev *hr_dev,
 				struct hns_roce_mr *mr, int flags, u32 pdn,
 				int mr_access_flags, u64 iova, u64 size,
 				void *mb_buf);
-	int (*frmr_write_mtpt)(void *mb_buf, struct hns_roce_mr *mr);
+	int (*frmr_write_mtpt)(struct hns_roce_dev *hr_dev, void *mb_buf,
+			       struct hns_roce_mr *mr);
 	int (*mw_write_mtpt)(void *mb_buf, struct hns_roce_mw *mw);
 	void (*write_cqc)(struct hns_roce_dev *hr_dev,
 			  struct hns_roce_cq *hr_cq, void *mb_buf, u64 *mtts,
-			  dma_addr_t dma_handle, int nent, u32 vector);
+			  dma_addr_t dma_handle);
 	int (*set_hem)(struct hns_roce_dev *hr_dev,
 		       struct hns_roce_hem_table *table, int obj, int step_idx);
 	int (*clear_hem)(struct hns_roce_dev *hr_dev,
@@ -974,7 +938,7 @@
 	int (*poll_cq)(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 	int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
 			struct ib_udata *udata);
-	void (*destroy_cq)(struct ib_cq *ibcq, struct ib_udata *udata);
+	int (*destroy_cq)(struct ib_cq *ibcq, struct ib_udata *udata);
 	int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 	int (*init_eq)(struct hns_roce_dev *hr_dev);
 	void (*cleanup_eq)(struct hns_roce_dev *hr_dev);
@@ -1007,6 +971,9 @@
 	bool			dis_db;
 	unsigned long		reset_cnt;
 	struct hns_roce_ib_iboe iboe;
+	enum hns_roce_device_state state;
+	struct list_head	qp_list; /* list of all qps on this dev */
+	spinlock_t		qp_list_lock; /* protect qp_list */
 
 	struct list_head        pgdir_list;
 	struct mutex            pgdir_mutex;
@@ -1091,11 +1058,6 @@
 	return container_of(ibsrq, struct hns_roce_srq, ibsrq);
 }
 
-static inline struct hns_roce_sqp *hr_to_hr_sqp(struct hns_roce_qp *hr_qp)
-{
-	return container_of(hr_qp, struct hns_roce_sqp, hr_qp);
-}
-
 static inline void hns_roce_write64_k(__le32 val[2], void __iomem *dest)
 {
 	__raw_writeq(*(u64 *) val, dest);
@@ -1107,15 +1069,75 @@
 	return xa_load(&hr_dev->qp_table_xa, qpn & (hr_dev->caps.num_qps - 1));
 }
 
+static inline bool hns_roce_buf_is_direct(struct hns_roce_buf *buf)
+{
+	if (buf->page_list)
+		return false;
+
+	return true;
+}
+
 static inline void *hns_roce_buf_offset(struct hns_roce_buf *buf, int offset)
 {
-	u32 page_size = 1 << buf->page_shift;
+	if (hns_roce_buf_is_direct(buf))
+		return (char *)(buf->direct.buf) + (offset & (buf->size - 1));
 
-	if (buf->nbufs == 1)
-		return (char *)(buf->direct.buf) + offset;
+	return (char *)(buf->page_list[offset >> buf->page_shift].buf) +
+	       (offset & ((1 << buf->page_shift) - 1));
+}
+
+static inline dma_addr_t hns_roce_buf_page(struct hns_roce_buf *buf, int idx)
+{
+	if (hns_roce_buf_is_direct(buf))
+		return buf->direct.map + ((dma_addr_t)idx << buf->page_shift);
 	else
-		return (char *)(buf->page_list[offset >> buf->page_shift].buf) +
-		       (offset & (page_size - 1));
+		return buf->page_list[idx].map;
+}
+
+#define hr_hw_page_align(x)		ALIGN(x, 1 << HNS_HW_PAGE_SHIFT)
+
+static inline u64 to_hr_hw_page_addr(u64 addr)
+{
+	return addr >> HNS_HW_PAGE_SHIFT;
+}
+
+static inline u32 to_hr_hw_page_shift(u32 page_shift)
+{
+	return page_shift - HNS_HW_PAGE_SHIFT;
+}
+
+static inline u32 to_hr_hem_hopnum(u32 hopnum, u32 count)
+{
+	if (count > 0)
+		return hopnum == HNS_ROCE_HOP_NUM_0 ? 0 : hopnum;
+
+	return 0;
+}
+
+static inline u32 to_hr_hem_entries_size(u32 count, u32 buf_shift)
+{
+	return hr_hw_page_align(count << buf_shift);
+}
+
+static inline u32 to_hr_hem_entries_count(u32 count, u32 buf_shift)
+{
+	return hr_hw_page_align(count << buf_shift) >> buf_shift;
+}
+
+static inline u32 to_hr_hem_entries_shift(u32 count, u32 buf_shift)
+{
+	if (!count)
+		return 0;
+
+	return ilog2(to_hr_hem_entries_count(count, buf_shift));
+}
+
+#define DSCP_SHIFT 2
+
+static inline u8 get_tclass(const struct ib_global_route *grh)
+{
+	return grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP ?
+	       grh->traffic_class >> DSCP_SHIFT : grh->traffic_class;
 }
 
 int hns_roce_init_uar_table(struct hns_roce_dev *dev);
@@ -1130,29 +1152,21 @@
 int hns_roce_cmd_use_events(struct hns_roce_dev *hr_dev);
 void hns_roce_cmd_use_polling(struct hns_roce_dev *hr_dev);
 
-int hns_roce_mtt_init(struct hns_roce_dev *hr_dev, int npages, int page_shift,
-		      struct hns_roce_mtt *mtt);
-void hns_roce_mtt_cleanup(struct hns_roce_dev *hr_dev,
-			  struct hns_roce_mtt *mtt);
-int hns_roce_buf_write_mtt(struct hns_roce_dev *hr_dev,
-			   struct hns_roce_mtt *mtt, struct hns_roce_buf *buf);
-
-void hns_roce_mtr_init(struct hns_roce_mtr *mtr, int bt_pg_shift,
-		       int buf_pg_shift);
-int hns_roce_mtr_attach(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
-			dma_addr_t **bufs, struct hns_roce_buf_region *regions,
-			int region_cnt);
-void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev,
-			  struct hns_roce_mtr *mtr);
-
 /* hns roce hw need current block and next block addr from mtt */
 #define MTT_MIN_COUNT	 2
 int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 		      int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr);
+int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+			struct hns_roce_buf_attr *buf_attr,
+			unsigned int page_shift, struct ib_udata *udata,
+			unsigned long user_addr);
+void hns_roce_mtr_destroy(struct hns_roce_dev *hr_dev,
+			  struct hns_roce_mtr *mtr);
+int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+		     dma_addr_t *pages, int page_cnt);
 
 int hns_roce_init_pd_table(struct hns_roce_dev *hr_dev);
 int hns_roce_init_mr_table(struct hns_roce_dev *hr_dev);
-int hns_roce_init_eq_table(struct hns_roce_dev *hr_dev);
 int hns_roce_init_cq_table(struct hns_roce_dev *hr_dev);
 int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev);
 int hns_roce_init_srq_table(struct hns_roce_dev *hr_dev);
@@ -1177,13 +1191,16 @@
 				unsigned long obj, int cnt,
 				int rr);
 
-int hns_roce_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr,
-		       u32 flags, struct ib_udata *udata);
+int hns_roce_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr,
+		       struct ib_udata *udata);
 int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
-void hns_roce_destroy_ah(struct ib_ah *ah, u32 flags);
+static inline int hns_roce_destroy_ah(struct ib_ah *ah, u32 flags)
+{
+	return 0;
+}
 
 int hns_roce_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
-void hns_roce_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+int hns_roce_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
 struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
@@ -1193,38 +1210,27 @@
 			   u64 virt_addr, int mr_access_flags, struct ib_pd *pd,
 			   struct ib_udata *udata);
 struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-				u32 max_num_sg, struct ib_udata *udata);
+				u32 max_num_sg);
 int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		       unsigned int *sg_offset);
 int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
-int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev,
-		       struct hns_roce_cmd_mailbox *mailbox,
-		       unsigned long mpt_index);
+int hns_roce_hw_destroy_mpt(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_cmd_mailbox *mailbox,
+			    unsigned long mpt_index);
 unsigned long key_to_hw_index(u32 key);
 
-struct ib_mw *hns_roce_alloc_mw(struct ib_pd *pd, enum ib_mw_type,
-				struct ib_udata *udata);
+int hns_roce_alloc_mw(struct ib_mw *mw, struct ib_udata *udata);
 int hns_roce_dealloc_mw(struct ib_mw *ibmw);
 
-void hns_roce_buf_free(struct hns_roce_dev *hr_dev, u32 size,
-		       struct hns_roce_buf *buf);
+void hns_roce_buf_free(struct hns_roce_dev *hr_dev, struct hns_roce_buf *buf);
 int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct,
 		       struct hns_roce_buf *buf, u32 page_shift);
 
-int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev,
-			       struct hns_roce_mtt *mtt, struct ib_umem *umem);
-
-void hns_roce_init_buf_region(struct hns_roce_buf_region *region, int hopnum,
-			      int offset, int buf_cnt);
-int hns_roce_alloc_buf_list(struct hns_roce_buf_region *regions,
-			    dma_addr_t **bufs, int count);
-void hns_roce_free_buf_list(dma_addr_t **bufs, int count);
-
 int hns_roce_get_kmem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs,
 			   int buf_cnt, int start, struct hns_roce_buf *buf);
 int hns_roce_get_umem_bufs(struct hns_roce_dev *hr_dev, dma_addr_t *bufs,
 			   int buf_cnt, int start, struct ib_umem *umem,
-			   int page_shift);
+			   unsigned int page_shift);
 
 int hns_roce_create_srq(struct ib_srq *srq,
 			struct ib_srq_init_attr *srq_init_attr,
@@ -1232,16 +1238,17 @@
 int hns_roce_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr,
 			enum ib_srq_attr_mask srq_attr_mask,
 			struct ib_udata *udata);
-void hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
+int hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
 
 struct ib_qp *hns_roce_create_qp(struct ib_pd *ib_pd,
 				 struct ib_qp_init_attr *init_attr,
 				 struct ib_udata *udata);
 int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		       int attr_mask, struct ib_udata *udata);
-void *get_recv_wqe(struct hns_roce_qp *hr_qp, int n);
-void *get_send_wqe(struct hns_roce_qp *hr_qp, int n);
-void *get_send_extend_sge(struct hns_roce_qp *hr_qp, int n);
+void init_flush_work(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp);
+void *hns_roce_get_recv_wqe(struct hns_roce_qp *hr_qp, int n);
+void *hns_roce_get_send_wqe(struct hns_roce_qp *hr_qp, int n);
+void *hns_roce_get_extend_sge(struct hns_roce_qp *hr_qp, int n);
 bool hns_roce_wq_overflow(struct hns_roce_wq *hr_wq, int nreq,
 			  struct ib_cq *ib_cq);
 enum hns_roce_qp_state to_hns_roce_state(enum ib_qp_state state);
@@ -1250,19 +1257,15 @@
 void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq,
 			 struct hns_roce_cq *recv_cq);
 void hns_roce_qp_remove(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp);
-void hns_roce_qp_free(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp);
-void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn,
-			       int cnt);
+void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
+			 struct ib_udata *udata);
 __be32 send_ieth(const struct ib_send_wr *wr);
 int to_hr_qp_type(int qp_type);
 
-int hns_roce_ib_create_cq(struct ib_cq *ib_cq,
-			  const struct ib_cq_init_attr *attr,
-			  struct ib_udata *udata);
+int hns_roce_create_cq(struct ib_cq *ib_cq, const struct ib_cq_init_attr *attr,
+		       struct ib_udata *udata);
 
-void hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
-void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq);
-
+int hns_roce_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
 int hns_roce_db_map_user(struct hns_roce_ucontext *context,
 			 struct ib_udata *udata, unsigned long virt,
 			 struct hns_roce_db *db);
@@ -1277,9 +1280,10 @@
 void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type);
 void hns_roce_srq_event(struct hns_roce_dev *hr_dev, u32 srqn, int event_type);
 int hns_get_gid_index(struct hns_roce_dev *hr_dev, u8 port, int gid_index);
+void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev);
 int hns_roce_init(struct hns_roce_dev *hr_dev);
 void hns_roce_exit(struct hns_roce_dev *hr_dev);
 
-int hns_roce_fill_res_entry(struct sk_buff *msg,
-			    struct rdma_restrack_entry *res);
+int hns_roce_fill_res_cq_entry(struct sk_buff *msg,
+			       struct ib_cq *ib_cq);
 #endif /* _HNS_ROCE_DEVICE_H */
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c
index e822157..c880a8b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
@@ -39,6 +39,16 @@
 #define DMA_ADDR_T_SHIFT		12
 #define BT_BA_SHIFT			32
 
+#define HEM_INDEX_BUF			BIT(0)
+#define HEM_INDEX_L0			BIT(1)
+#define HEM_INDEX_L1			BIT(2)
+struct hns_roce_hem_index {
+	u64 buf;
+	u64 l0;
+	u64 l1;
+	u32 inited; /* indicates which index is available */
+};
+
 bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type)
 {
 	int hop_num = 0;
@@ -65,18 +75,6 @@
 	case HEM_TYPE_CQC_TIMER:
 		hop_num = hr_dev->caps.cqc_timer_hop_num;
 		break;
-	case HEM_TYPE_CQE:
-		hop_num = hr_dev->caps.cqe_hop_num;
-		break;
-	case HEM_TYPE_MTT:
-		hop_num = hr_dev->caps.mtt_hop_num;
-		break;
-	case HEM_TYPE_SRQWQE:
-		hop_num = hr_dev->caps.srqwqe_hop_num;
-		break;
-	case HEM_TYPE_IDX:
-		hop_num = hr_dev->caps.idx_hop_num;
-		break;
 	default:
 		return false;
 	}
@@ -84,25 +82,27 @@
 	return hop_num ? true : false;
 }
 
-static bool hns_roce_check_hem_null(struct hns_roce_hem **hem, u64 start_idx,
-			    u32 bt_chunk_num, u64 hem_max_num)
+static bool hns_roce_check_hem_null(struct hns_roce_hem **hem, u64 hem_idx,
+				    u32 bt_chunk_num, u64 hem_max_num)
 {
+	u64 start_idx = round_down(hem_idx, bt_chunk_num);
 	u64 check_max_num = start_idx + bt_chunk_num;
 	u64 i;
 
 	for (i = start_idx; (i < check_max_num) && (i < hem_max_num); i++)
-		if (hem[i])
+		if (i != hem_idx && hem[i])
 			return false;
 
 	return true;
 }
 
-static bool hns_roce_check_bt_null(u64 **bt, u64 start_idx, u32 bt_chunk_num)
+static bool hns_roce_check_bt_null(u64 **bt, u64 ba_idx, u32 bt_chunk_num)
 {
+	u64 start_idx = round_down(ba_idx, bt_chunk_num);
 	int i;
 
 	for (i = 0; i < bt_chunk_num; i++)
-		if (bt[start_idx + i])
+		if (i != ba_idx && bt[start_idx + i])
 			return false;
 
 	return true;
@@ -183,40 +183,8 @@
 		mhop->ba_l0_num = hr_dev->caps.srqc_bt_num;
 		mhop->hop_num = hr_dev->caps.srqc_hop_num;
 		break;
-	case HEM_TYPE_MTT:
-		mhop->buf_chunk_size = 1 << (hr_dev->caps.mtt_buf_pg_sz
-					     + PAGE_SHIFT);
-		mhop->bt_chunk_size = 1 << (hr_dev->caps.mtt_ba_pg_sz
-					     + PAGE_SHIFT);
-		mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN;
-		mhop->hop_num = hr_dev->caps.mtt_hop_num;
-		break;
-	case HEM_TYPE_CQE:
-		mhop->buf_chunk_size = 1 << (hr_dev->caps.cqe_buf_pg_sz
-					     + PAGE_SHIFT);
-		mhop->bt_chunk_size = 1 << (hr_dev->caps.cqe_ba_pg_sz
-					     + PAGE_SHIFT);
-		mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN;
-		mhop->hop_num = hr_dev->caps.cqe_hop_num;
-		break;
-	case HEM_TYPE_SRQWQE:
-		mhop->buf_chunk_size = 1 << (hr_dev->caps.srqwqe_buf_pg_sz
-					    + PAGE_SHIFT);
-		mhop->bt_chunk_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz
-					    + PAGE_SHIFT);
-		mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN;
-		mhop->hop_num = hr_dev->caps.srqwqe_hop_num;
-		break;
-	case HEM_TYPE_IDX:
-		mhop->buf_chunk_size = 1 << (hr_dev->caps.idx_buf_pg_sz
-				       + PAGE_SHIFT);
-		mhop->bt_chunk_size = 1 << (hr_dev->caps.idx_ba_pg_sz
-				       + PAGE_SHIFT);
-		mhop->ba_l0_num = mhop->bt_chunk_size / BA_BYTE_LEN;
-		mhop->hop_num = hr_dev->caps.idx_hop_num;
-		break;
 	default:
-		dev_err(dev, "Table %d not support multi-hop addressing!\n",
+		dev_err(dev, "table %u not support multi-hop addressing!\n",
 			type);
 		return -EINVAL;
 	}
@@ -264,8 +232,8 @@
 		mhop->l0_idx = table_idx;
 		break;
 	default:
-		dev_err(dev, "Table %d not support hop_num = %d!\n",
-			     table->type, mhop->hop_num);
+		dev_err(dev, "table %u not support hop_num = %u!\n",
+			table->type, mhop->hop_num);
 		return -EINVAL;
 	}
 	if (mhop->l0_idx >= mhop->ba_l0_num)
@@ -370,8 +338,8 @@
 	void __iomem *bt_cmd;
 	__le32 bt_cmd_val[2];
 	__le32 bt_cmd_h = 0;
-	__le32 bt_cmd_l = 0;
-	u64 bt_ba = 0;
+	__le32 bt_cmd_l;
+	u64 bt_ba;
 	int ret = 0;
 
 	/* Find the HEM(Hardware Entry Memory) entry */
@@ -434,178 +402,235 @@
 	return ret;
 }
 
-static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev,
-				   struct hns_roce_hem_table *table,
-				   unsigned long obj)
+static int calc_hem_config(struct hns_roce_dev *hr_dev,
+			   struct hns_roce_hem_table *table, unsigned long obj,
+			   struct hns_roce_hem_mhop *mhop,
+			   struct hns_roce_hem_index *index)
 {
-	struct device *dev = hr_dev->dev;
-	struct hns_roce_hem_mhop mhop;
-	struct hns_roce_hem_iter iter;
-	u32 buf_chunk_size;
-	u32 bt_chunk_size;
-	u32 chunk_ba_num;
-	u32 hop_num;
-	u32 size;
-	u32 bt_num;
-	u64 hem_idx;
-	u64 bt_l1_idx = 0;
-	u64 bt_l0_idx = 0;
-	u64 bt_ba;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	unsigned long mhop_obj = obj;
-	int bt_l1_allocated = 0;
-	int bt_l0_allocated = 0;
-	int step_idx;
+	u32 l0_idx, l1_idx, l2_idx;
+	u32 chunk_ba_num;
+	u32 bt_num;
 	int ret;
 
-	ret = hns_roce_calc_hem_mhop(hr_dev, table, &mhop_obj, &mhop);
+	ret = hns_roce_calc_hem_mhop(hr_dev, table, &mhop_obj, mhop);
 	if (ret)
 		return ret;
 
-	buf_chunk_size = mhop.buf_chunk_size;
-	bt_chunk_size = mhop.bt_chunk_size;
-	hop_num = mhop.hop_num;
-	chunk_ba_num = bt_chunk_size / BA_BYTE_LEN;
-
-	bt_num = hns_roce_get_bt_num(table->type, hop_num);
+	l0_idx = mhop->l0_idx;
+	l1_idx = mhop->l1_idx;
+	l2_idx = mhop->l2_idx;
+	chunk_ba_num = mhop->bt_chunk_size / BA_BYTE_LEN;
+	bt_num = hns_roce_get_bt_num(table->type, mhop->hop_num);
 	switch (bt_num) {
 	case 3:
-		hem_idx = mhop.l0_idx * chunk_ba_num * chunk_ba_num +
-			  mhop.l1_idx * chunk_ba_num + mhop.l2_idx;
-		bt_l1_idx = mhop.l0_idx * chunk_ba_num + mhop.l1_idx;
-		bt_l0_idx = mhop.l0_idx;
+		index->l1 = l0_idx * chunk_ba_num + l1_idx;
+		index->l0 = l0_idx;
+		index->buf = l0_idx * chunk_ba_num * chunk_ba_num +
+			     l1_idx * chunk_ba_num + l2_idx;
 		break;
 	case 2:
-		hem_idx = mhop.l0_idx * chunk_ba_num + mhop.l1_idx;
-		bt_l0_idx = mhop.l0_idx;
+		index->l0 = l0_idx;
+		index->buf = l0_idx * chunk_ba_num + l1_idx;
 		break;
 	case 1:
-		hem_idx = mhop.l0_idx;
+		index->buf = l0_idx;
 		break;
 	default:
-		dev_err(dev, "Table %d not support hop_num = %d!\n",
-			     table->type, hop_num);
+		ibdev_err(ibdev, "table %u not support mhop.hop_num = %u!\n",
+			  table->type, mhop->hop_num);
 		return -EINVAL;
 	}
 
-	if (unlikely(hem_idx >= table->num_hem)) {
-		dev_err(dev, "Table %d exceed hem limt idx = %llu,max = %lu!\n",
-			     table->type, hem_idx, table->num_hem);
+	if (unlikely(index->buf >= table->num_hem)) {
+		ibdev_err(ibdev, "table %u exceed hem limt idx %llu, max %lu!\n",
+			  table->type, index->buf, table->num_hem);
 		return -EINVAL;
 	}
 
-	mutex_lock(&table->mutex);
+	return 0;
+}
 
-	if (table->hem[hem_idx]) {
-		++table->hem[hem_idx]->refcount;
-		goto out;
+static void free_mhop_hem(struct hns_roce_dev *hr_dev,
+			  struct hns_roce_hem_table *table,
+			  struct hns_roce_hem_mhop *mhop,
+			  struct hns_roce_hem_index *index)
+{
+	u32 bt_size = mhop->bt_chunk_size;
+	struct device *dev = hr_dev->dev;
+
+	if (index->inited & HEM_INDEX_BUF) {
+		hns_roce_free_hem(hr_dev, table->hem[index->buf]);
+		table->hem[index->buf] = NULL;
 	}
 
+	if (index->inited & HEM_INDEX_L1) {
+		dma_free_coherent(dev, bt_size, table->bt_l1[index->l1],
+				  table->bt_l1_dma_addr[index->l1]);
+		table->bt_l1[index->l1] = NULL;
+	}
+
+	if (index->inited & HEM_INDEX_L0) {
+		dma_free_coherent(dev, bt_size, table->bt_l0[index->l0],
+				  table->bt_l0_dma_addr[index->l0]);
+		table->bt_l0[index->l0] = NULL;
+	}
+}
+
+static int alloc_mhop_hem(struct hns_roce_dev *hr_dev,
+			  struct hns_roce_hem_table *table,
+			  struct hns_roce_hem_mhop *mhop,
+			  struct hns_roce_hem_index *index)
+{
+	u32 bt_size = mhop->bt_chunk_size;
+	struct device *dev = hr_dev->dev;
+	struct hns_roce_hem_iter iter;
+	gfp_t flag;
+	u64 bt_ba;
+	u32 size;
+	int ret;
+
 	/* alloc L1 BA's chunk */
-	if ((check_whether_bt_num_3(table->type, hop_num) ||
-		check_whether_bt_num_2(table->type, hop_num)) &&
-		!table->bt_l0[bt_l0_idx]) {
-		table->bt_l0[bt_l0_idx] = dma_alloc_coherent(dev, bt_chunk_size,
-					    &(table->bt_l0_dma_addr[bt_l0_idx]),
+	if ((check_whether_bt_num_3(table->type, mhop->hop_num) ||
+	     check_whether_bt_num_2(table->type, mhop->hop_num)) &&
+	     !table->bt_l0[index->l0]) {
+		table->bt_l0[index->l0] = dma_alloc_coherent(dev, bt_size,
+					    &table->bt_l0_dma_addr[index->l0],
 					    GFP_KERNEL);
-		if (!table->bt_l0[bt_l0_idx]) {
+		if (!table->bt_l0[index->l0]) {
 			ret = -ENOMEM;
 			goto out;
 		}
-		bt_l0_allocated = 1;
-
-		/* set base address to hardware */
-		if (table->type < HEM_TYPE_MTT) {
-			step_idx = 0;
-			if (hr_dev->hw->set_hem(hr_dev, table, obj, step_idx)) {
-				ret = -ENODEV;
-				dev_err(dev, "set HEM base address to HW failed!\n");
-				goto err_dma_alloc_l1;
-			}
-		}
+		index->inited |= HEM_INDEX_L0;
 	}
 
 	/* alloc L2 BA's chunk */
-	if (check_whether_bt_num_3(table->type, hop_num) &&
-	    !table->bt_l1[bt_l1_idx])  {
-		table->bt_l1[bt_l1_idx] = dma_alloc_coherent(dev, bt_chunk_size,
-					    &(table->bt_l1_dma_addr[bt_l1_idx]),
+	if (check_whether_bt_num_3(table->type, mhop->hop_num) &&
+	    !table->bt_l1[index->l1])  {
+		table->bt_l1[index->l1] = dma_alloc_coherent(dev, bt_size,
+					    &table->bt_l1_dma_addr[index->l1],
 					    GFP_KERNEL);
-		if (!table->bt_l1[bt_l1_idx]) {
+		if (!table->bt_l1[index->l1]) {
 			ret = -ENOMEM;
-			goto err_dma_alloc_l1;
+			goto err_alloc_hem;
 		}
-		bt_l1_allocated = 1;
-		*(table->bt_l0[bt_l0_idx] + mhop.l1_idx) =
-					       table->bt_l1_dma_addr[bt_l1_idx];
-
-		/* set base address to hardware */
-		step_idx = 1;
-		if (hr_dev->hw->set_hem(hr_dev, table, obj, step_idx)) {
-			ret = -ENODEV;
-			dev_err(dev, "set HEM base address to HW failed!\n");
-			goto err_alloc_hem_buf;
-		}
+		index->inited |= HEM_INDEX_L1;
+		*(table->bt_l0[index->l0] + mhop->l1_idx) =
+					       table->bt_l1_dma_addr[index->l1];
 	}
 
 	/*
 	 * alloc buffer space chunk for QPC/MTPT/CQC/SRQC/SCCC.
 	 * alloc bt space chunk for MTT/CQE.
 	 */
-	size = table->type < HEM_TYPE_MTT ? buf_chunk_size : bt_chunk_size;
-	table->hem[hem_idx] = hns_roce_alloc_hem(hr_dev,
-						size >> PAGE_SHIFT,
-						size,
-						(table->lowmem ? GFP_KERNEL :
-						GFP_HIGHUSER) | __GFP_NOWARN);
-	if (!table->hem[hem_idx]) {
+	size = table->type < HEM_TYPE_MTT ? mhop->buf_chunk_size : bt_size;
+	flag = (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) | __GFP_NOWARN;
+	table->hem[index->buf] = hns_roce_alloc_hem(hr_dev, size >> PAGE_SHIFT,
+						    size, flag);
+	if (!table->hem[index->buf]) {
 		ret = -ENOMEM;
-		goto err_alloc_hem_buf;
+		goto err_alloc_hem;
 	}
 
-	hns_roce_hem_first(table->hem[hem_idx], &iter);
+	index->inited |= HEM_INDEX_BUF;
+	hns_roce_hem_first(table->hem[index->buf], &iter);
 	bt_ba = hns_roce_hem_addr(&iter);
-
 	if (table->type < HEM_TYPE_MTT) {
-		if (hop_num == 2) {
-			*(table->bt_l1[bt_l1_idx] + mhop.l2_idx) = bt_ba;
-			step_idx = 2;
-		} else if (hop_num == 1) {
-			*(table->bt_l0[bt_l0_idx] + mhop.l1_idx) = bt_ba;
-			step_idx = 1;
-		} else if (hop_num == HNS_ROCE_HOP_NUM_0) {
-			step_idx = 0;
-		} else {
-			ret = -EINVAL;
-			goto err_dma_alloc_l1;
-		}
-
-		/* set HEM base address to hardware */
-		if (hr_dev->hw->set_hem(hr_dev, table, obj, step_idx)) {
-			ret = -ENODEV;
-			dev_err(dev, "set HEM base address to HW failed!\n");
-			goto err_alloc_hem_buf;
-		}
-	} else if (hop_num == 2) {
-		*(table->bt_l0[bt_l0_idx] + mhop.l1_idx) = bt_ba;
+		if (mhop->hop_num == 2)
+			*(table->bt_l1[index->l1] + mhop->l2_idx) = bt_ba;
+		else if (mhop->hop_num == 1)
+			*(table->bt_l0[index->l0] + mhop->l1_idx) = bt_ba;
+	} else if (mhop->hop_num == 2) {
+		*(table->bt_l0[index->l0] + mhop->l1_idx) = bt_ba;
 	}
 
-	++table->hem[hem_idx]->refcount;
+	return 0;
+err_alloc_hem:
+	free_mhop_hem(hr_dev, table, mhop, index);
+out:
+	return ret;
+}
+
+static int set_mhop_hem(struct hns_roce_dev *hr_dev,
+			struct hns_roce_hem_table *table, unsigned long obj,
+			struct hns_roce_hem_mhop *mhop,
+			struct hns_roce_hem_index *index)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	int step_idx;
+	int ret = 0;
+
+	if (index->inited & HEM_INDEX_L0) {
+		ret = hr_dev->hw->set_hem(hr_dev, table, obj, 0);
+		if (ret) {
+			ibdev_err(ibdev, "set HEM step 0 failed!\n");
+			goto out;
+		}
+	}
+
+	if (index->inited & HEM_INDEX_L1) {
+		ret = hr_dev->hw->set_hem(hr_dev, table, obj, 1);
+		if (ret) {
+			ibdev_err(ibdev, "set HEM step 1 failed!\n");
+			goto out;
+		}
+	}
+
+	if (index->inited & HEM_INDEX_BUF) {
+		if (mhop->hop_num == HNS_ROCE_HOP_NUM_0)
+			step_idx = 0;
+		else
+			step_idx = mhop->hop_num;
+		ret = hr_dev->hw->set_hem(hr_dev, table, obj, step_idx);
+		if (ret)
+			ibdev_err(ibdev, "set HEM step last failed!\n");
+	}
+out:
+	return ret;
+}
+
+static int hns_roce_table_mhop_get(struct hns_roce_dev *hr_dev,
+				   struct hns_roce_hem_table *table,
+				   unsigned long obj)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_hem_index index = {};
+	struct hns_roce_hem_mhop mhop = {};
+	int ret;
+
+	ret = calc_hem_config(hr_dev, table, obj, &mhop, &index);
+	if (ret) {
+		ibdev_err(ibdev, "calc hem config failed!\n");
+		return ret;
+	}
+
+	mutex_lock(&table->mutex);
+	if (table->hem[index.buf]) {
+		++table->hem[index.buf]->refcount;
+		goto out;
+	}
+
+	ret = alloc_mhop_hem(hr_dev, table, &mhop, &index);
+	if (ret) {
+		ibdev_err(ibdev, "alloc mhop hem failed!\n");
+		goto out;
+	}
+
+	/* set HEM base address to hardware */
+	if (table->type < HEM_TYPE_MTT) {
+		ret = set_mhop_hem(hr_dev, table, obj, &mhop, &index);
+		if (ret) {
+			ibdev_err(ibdev, "set HEM address to HW failed!\n");
+			goto err_alloc;
+		}
+	}
+
+	++table->hem[index.buf]->refcount;
 	goto out;
 
-err_alloc_hem_buf:
-	if (bt_l1_allocated) {
-		dma_free_coherent(dev, bt_chunk_size, table->bt_l1[bt_l1_idx],
-				  table->bt_l1_dma_addr[bt_l1_idx]);
-		table->bt_l1[bt_l1_idx] = NULL;
-	}
-
-err_dma_alloc_l1:
-	if (bt_l0_allocated) {
-		dma_free_coherent(dev, bt_chunk_size, table->bt_l0[bt_l0_idx],
-				  table->bt_l0_dma_addr[bt_l0_idx]);
-		table->bt_l0[bt_l0_idx] = NULL;
-	}
-
+err_alloc:
+	free_mhop_hem(hr_dev, table, &mhop, &index);
 out:
 	mutex_unlock(&table->mutex);
 	return ret;
@@ -656,116 +681,75 @@
 	return ret;
 }
 
+static void clear_mhop_hem(struct hns_roce_dev *hr_dev,
+			   struct hns_roce_hem_table *table, unsigned long obj,
+			   struct hns_roce_hem_mhop *mhop,
+			   struct hns_roce_hem_index *index)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	u32 hop_num = mhop->hop_num;
+	u32 chunk_ba_num;
+	int step_idx;
+
+	index->inited = HEM_INDEX_BUF;
+	chunk_ba_num = mhop->bt_chunk_size / BA_BYTE_LEN;
+	if (check_whether_bt_num_2(table->type, hop_num)) {
+		if (hns_roce_check_hem_null(table->hem, index->buf,
+					    chunk_ba_num, table->num_hem))
+			index->inited |= HEM_INDEX_L0;
+	} else if (check_whether_bt_num_3(table->type, hop_num)) {
+		if (hns_roce_check_hem_null(table->hem, index->buf,
+					    chunk_ba_num, table->num_hem)) {
+			index->inited |= HEM_INDEX_L1;
+			if (hns_roce_check_bt_null(table->bt_l1, index->l1,
+						   chunk_ba_num))
+				index->inited |= HEM_INDEX_L0;
+		}
+	}
+
+	if (table->type < HEM_TYPE_MTT) {
+		if (hop_num == HNS_ROCE_HOP_NUM_0)
+			step_idx = 0;
+		else
+			step_idx = hop_num;
+
+		if (hr_dev->hw->clear_hem(hr_dev, table, obj, step_idx))
+			ibdev_warn(ibdev, "failed to clear hop%u HEM.\n", hop_num);
+
+		if (index->inited & HEM_INDEX_L1)
+			if (hr_dev->hw->clear_hem(hr_dev, table, obj, 1))
+				ibdev_warn(ibdev, "failed to clear HEM step 1.\n");
+
+		if (index->inited & HEM_INDEX_L0)
+			if (hr_dev->hw->clear_hem(hr_dev, table, obj, 0))
+				ibdev_warn(ibdev, "failed to clear HEM step 0.\n");
+	}
+}
+
 static void hns_roce_table_mhop_put(struct hns_roce_dev *hr_dev,
 				    struct hns_roce_hem_table *table,
 				    unsigned long obj,
 				    int check_refcount)
 {
-	struct device *dev = hr_dev->dev;
-	struct hns_roce_hem_mhop mhop;
-	unsigned long mhop_obj = obj;
-	u32 bt_chunk_size;
-	u32 chunk_ba_num;
-	u32 hop_num;
-	u32 start_idx;
-	u32 bt_num;
-	u64 hem_idx;
-	u64 bt_l1_idx = 0;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_hem_index index = {};
+	struct hns_roce_hem_mhop mhop = {};
 	int ret;
 
-	ret = hns_roce_calc_hem_mhop(hr_dev, table, &mhop_obj, &mhop);
-	if (ret)
-		return;
-
-	bt_chunk_size = mhop.bt_chunk_size;
-	hop_num = mhop.hop_num;
-	chunk_ba_num = bt_chunk_size / BA_BYTE_LEN;
-
-	bt_num = hns_roce_get_bt_num(table->type, hop_num);
-	switch (bt_num) {
-	case 3:
-		hem_idx = mhop.l0_idx * chunk_ba_num * chunk_ba_num +
-			  mhop.l1_idx * chunk_ba_num + mhop.l2_idx;
-		bt_l1_idx = mhop.l0_idx * chunk_ba_num + mhop.l1_idx;
-		break;
-	case 2:
-		hem_idx = mhop.l0_idx * chunk_ba_num + mhop.l1_idx;
-		break;
-	case 1:
-		hem_idx = mhop.l0_idx;
-		break;
-	default:
-		dev_err(dev, "Table %d not support hop_num = %d!\n",
-			     table->type, hop_num);
+	ret = calc_hem_config(hr_dev, table, obj, &mhop, &index);
+	if (ret) {
+		ibdev_err(ibdev, "calc hem config failed!\n");
 		return;
 	}
 
 	mutex_lock(&table->mutex);
-
-	if (check_refcount && (--table->hem[hem_idx]->refcount > 0)) {
+	if (check_refcount && (--table->hem[index.buf]->refcount > 0)) {
 		mutex_unlock(&table->mutex);
 		return;
 	}
 
-	if (table->type < HEM_TYPE_MTT && hop_num == 1) {
-		if (hr_dev->hw->clear_hem(hr_dev, table, obj, 1))
-			dev_warn(dev, "Clear HEM base address failed.\n");
-	} else if (table->type < HEM_TYPE_MTT && hop_num == 2) {
-		if (hr_dev->hw->clear_hem(hr_dev, table, obj, 2))
-			dev_warn(dev, "Clear HEM base address failed.\n");
-	} else if (table->type < HEM_TYPE_MTT &&
-		   hop_num == HNS_ROCE_HOP_NUM_0) {
-		if (hr_dev->hw->clear_hem(hr_dev, table, obj, 0))
-			dev_warn(dev, "Clear HEM base address failed.\n");
-	}
-
-	/*
-	 * free buffer space chunk for QPC/MTPT/CQC/SRQC/SCCC.
-	 * free bt space chunk for MTT/CQE.
-	 */
-	hns_roce_free_hem(hr_dev, table->hem[hem_idx]);
-	table->hem[hem_idx] = NULL;
-
-	if (check_whether_bt_num_2(table->type, hop_num)) {
-		start_idx = mhop.l0_idx * chunk_ba_num;
-		if (hns_roce_check_hem_null(table->hem, start_idx,
-					    chunk_ba_num, table->num_hem)) {
-			if (table->type < HEM_TYPE_MTT &&
-			    hr_dev->hw->clear_hem(hr_dev, table, obj, 0))
-				dev_warn(dev, "Clear HEM base address failed.\n");
-
-			dma_free_coherent(dev, bt_chunk_size,
-					  table->bt_l0[mhop.l0_idx],
-					  table->bt_l0_dma_addr[mhop.l0_idx]);
-			table->bt_l0[mhop.l0_idx] = NULL;
-		}
-	} else if (check_whether_bt_num_3(table->type, hop_num)) {
-		start_idx = mhop.l0_idx * chunk_ba_num * chunk_ba_num +
-			    mhop.l1_idx * chunk_ba_num;
-		if (hns_roce_check_hem_null(table->hem, start_idx,
-					    chunk_ba_num, table->num_hem)) {
-			if (hr_dev->hw->clear_hem(hr_dev, table, obj, 1))
-				dev_warn(dev, "Clear HEM base address failed.\n");
-
-			dma_free_coherent(dev, bt_chunk_size,
-					  table->bt_l1[bt_l1_idx],
-					  table->bt_l1_dma_addr[bt_l1_idx]);
-			table->bt_l1[bt_l1_idx] = NULL;
-
-			start_idx = mhop.l0_idx * chunk_ba_num;
-			if (hns_roce_check_bt_null(table->bt_l1, start_idx,
-						   chunk_ba_num)) {
-				if (hr_dev->hw->clear_hem(hr_dev, table, obj,
-							  0))
-					dev_warn(dev, "Clear HEM base address failed.\n");
-
-				dma_free_coherent(dev, bt_chunk_size,
-					    table->bt_l0[mhop.l0_idx],
-					    table->bt_l0_dma_addr[mhop.l0_idx]);
-				table->bt_l0[mhop.l0_idx] = NULL;
-			}
-		}
-	}
+	clear_mhop_hem(hr_dev, table, obj, &mhop, &index);
+	free_mhop_hem(hr_dev, table, &mhop, &index);
 
 	mutex_unlock(&table->mutex);
 }
@@ -871,57 +855,6 @@
 	return addr;
 }
 
-int hns_roce_table_get_range(struct hns_roce_dev *hr_dev,
-			     struct hns_roce_hem_table *table,
-			     unsigned long start, unsigned long end)
-{
-	struct hns_roce_hem_mhop mhop;
-	unsigned long inc = table->table_chunk_size / table->obj_size;
-	unsigned long i = 0;
-	int ret;
-
-	if (hns_roce_check_whether_mhop(hr_dev, table->type)) {
-		ret = hns_roce_calc_hem_mhop(hr_dev, table, NULL, &mhop);
-		if (ret)
-			goto fail;
-		inc = mhop.bt_chunk_size / table->obj_size;
-	}
-
-	/* Allocate MTT entry memory according to chunk(128K) */
-	for (i = start; i <= end; i += inc) {
-		ret = hns_roce_table_get(hr_dev, table, i);
-		if (ret)
-			goto fail;
-	}
-
-	return 0;
-
-fail:
-	while (i > start) {
-		i -= inc;
-		hns_roce_table_put(hr_dev, table, i);
-	}
-	return ret;
-}
-
-void hns_roce_table_put_range(struct hns_roce_dev *hr_dev,
-			      struct hns_roce_hem_table *table,
-			      unsigned long start, unsigned long end)
-{
-	struct hns_roce_hem_mhop mhop;
-	unsigned long inc = table->table_chunk_size / table->obj_size;
-	unsigned long i;
-
-	if (hns_roce_check_whether_mhop(hr_dev, table->type)) {
-		if (hns_roce_calc_hem_mhop(hr_dev, table, NULL, &mhop))
-			return;
-		inc = mhop.bt_chunk_size / table->obj_size;
-	}
-
-	for (i = start; i <= end; i += inc)
-		hns_roce_table_put(hr_dev, table, i);
-}
-
 int hns_roce_init_hem_table(struct hns_roce_dev *hr_dev,
 			    struct hns_roce_hem_table *table, u32 type,
 			    unsigned long obj_size, unsigned long nobj,
@@ -1084,13 +1017,7 @@
 
 void hns_roce_cleanup_hem(struct hns_roce_dev *hr_dev)
 {
-	if ((hr_dev->caps.num_idx_segs))
-		hns_roce_cleanup_hem_table(hr_dev,
-					   &hr_dev->mr_table.mtt_idx_table);
-	if (hr_dev->caps.num_srqwqe_segs)
-		hns_roce_cleanup_hem_table(hr_dev,
-					   &hr_dev->mr_table.mtt_srqwqe_table);
-	if (hr_dev->caps.srqc_entry_sz)
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ)
 		hns_roce_cleanup_hem_table(hr_dev,
 					   &hr_dev->srq_table.table);
 	hns_roce_cleanup_hem_table(hr_dev, &hr_dev->cq_table.table);
@@ -1100,7 +1027,7 @@
 	if (hr_dev->caps.cqc_timer_entry_sz)
 		hns_roce_cleanup_hem_table(hr_dev,
 					   &hr_dev->cqc_timer_table);
-	if (hr_dev->caps.sccc_entry_sz)
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL)
 		hns_roce_cleanup_hem_table(hr_dev,
 					   &hr_dev->qp_table.sccc_table);
 	if (hr_dev->caps.trrl_entry_sz)
@@ -1109,10 +1036,6 @@
 	hns_roce_cleanup_hem_table(hr_dev, &hr_dev->qp_table.irrl_table);
 	hns_roce_cleanup_hem_table(hr_dev, &hr_dev->qp_table.qp_table);
 	hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtpt_table);
-	if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE))
-		hns_roce_cleanup_hem_table(hr_dev,
-					   &hr_dev->mr_table.mtt_cqe_table);
-	hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtt_table);
 }
 
 struct roce_hem_item {
@@ -1311,7 +1234,7 @@
 	}
 
 	if (offset < r->offset) {
-		dev_err(hr_dev->dev, "invalid offset %d,min %d!\n",
+		dev_err(hr_dev->dev, "invalid offset %d, min %u!\n",
 			offset, r->offset);
 		return -EINVAL;
 	}
@@ -1383,6 +1306,7 @@
 	void *cpu_base;
 	u64 phy_base;
 	int ret = 0;
+	int ba_num;
 	int offset;
 	int total;
 	int step;
@@ -1393,12 +1317,16 @@
 	if (root_hem)
 		return 0;
 
+	ba_num = hns_roce_hem_list_calc_root_ba(regions, region_cnt, unit);
+	if (ba_num < 1)
+		return -ENOMEM;
+
 	INIT_LIST_HEAD(&temp_root);
-	total = r->offset;
+	offset = r->offset;
 	/* indicate to last region */
 	r = &regions[region_cnt - 1];
-	root_hem = hem_list_alloc_item(hr_dev, total, r->offset + r->count - 1,
-				       unit, true, 0);
+	root_hem = hem_list_alloc_item(hr_dev, offset, r->offset + r->count - 1,
+				       ba_num, true, 0);
 	if (!root_hem)
 		return -ENOMEM;
 	list_add(&root_hem->list, &temp_root);
@@ -1410,7 +1338,7 @@
 		INIT_LIST_HEAD(&temp_list[i]);
 
 	total = 0;
-	for (i = 0; i < region_cnt && total < unit; i++) {
+	for (i = 0; i < region_cnt && total < ba_num; i++) {
 		r = &regions[i];
 		if (!r->count)
 			continue;
@@ -1443,7 +1371,8 @@
 			/* if exist mid bt, link L1 to L0 */
 			list_for_each_entry_safe(hem, temp_hem,
 					  &hem_list->mid_bt[i][1], list) {
-				offset = hem->start / step * BA_BYTE_LEN;
+				offset = (hem->start - r->offset) / step *
+					  BA_BYTE_LEN;
 				hem_list_link_bt(hr_dev, cpu_base + offset,
 						 hem->dma_addr);
 				total++;
@@ -1471,11 +1400,11 @@
 int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev,
 			      struct hns_roce_hem_list *hem_list,
 			      const struct hns_roce_buf_region *regions,
-			      int region_cnt)
+			      int region_cnt, unsigned int bt_pg_shift)
 {
 	const struct hns_roce_buf_region *r;
 	int ofs, end;
-	int ret = 0;
+	int ret;
 	int unit;
 	int i;
 
@@ -1485,7 +1414,7 @@
 		return -EINVAL;
 	}
 
-	unit = (1 << hem_list->bt_pg_shift) / BA_BYTE_LEN;
+	unit = (1 << bt_pg_shift) / BA_BYTE_LEN;
 	for (i = 0; i < region_cnt; i++) {
 		r = &regions[i];
 		if (!r->count)
@@ -1532,8 +1461,7 @@
 	hem_list->root_ba = 0;
 }
 
-void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list,
-			    int bt_page_order)
+void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list)
 {
 	int i, j;
 
@@ -1542,8 +1470,6 @@
 	for (i = 0; i < HNS_ROCE_MAX_BT_REGION; i++)
 		for (j = 0; j < HNS_ROCE_MAX_BT_LEVEL; j++)
 			INIT_LIST_HEAD(&hem_list->mid_bt[i][j]);
-
-	hem_list->bt_pg_shift = bt_page_order;
 }
 
 void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev,
diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h b/drivers/infiniband/hw/hns/hns_roce_hem.h
index 3bb8f78..b34c940 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hem.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hem.h
@@ -115,12 +115,6 @@
 void *hns_roce_table_find(struct hns_roce_dev *hr_dev,
 			  struct hns_roce_hem_table *table, unsigned long obj,
 			  dma_addr_t *dma_handle);
-int hns_roce_table_get_range(struct hns_roce_dev *hr_dev,
-			     struct hns_roce_hem_table *table,
-			     unsigned long start, unsigned long end);
-void hns_roce_table_put_range(struct hns_roce_dev *hr_dev,
-			      struct hns_roce_hem_table *table,
-			      unsigned long start, unsigned long end);
 int hns_roce_init_hem_table(struct hns_roce_dev *hr_dev,
 			    struct hns_roce_hem_table *table, u32 type,
 			    unsigned long obj_size, unsigned long nobj,
@@ -133,14 +127,13 @@
 			   struct hns_roce_hem_mhop *mhop);
 bool hns_roce_check_whether_mhop(struct hns_roce_dev *hr_dev, u32 type);
 
-void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list,
-			    int bt_page_order);
+void hns_roce_hem_list_init(struct hns_roce_hem_list *hem_list);
 int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions,
 				   int region_cnt, int unit);
 int hns_roce_hem_list_request(struct hns_roce_dev *hr_dev,
 			      struct hns_roce_hem_list *hem_list,
 			      const struct hns_roce_buf_region *regions,
-			      int region_cnt);
+			      int region_cnt, unsigned int bt_pg_shift);
 void hns_roce_hem_list_release(struct hns_roce_dev *hr_dev,
 			       struct hns_roce_hem_list *hem_list);
 void *hns_roce_hem_list_find_mtt(struct hns_roce_dev *hr_dev,
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index a405c64..5f4d8a3 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -69,16 +69,16 @@
 	struct hns_roce_wqe_data_seg *dseg = NULL;
 	struct hns_roce_qp *qp = to_hr_qp(ibqp);
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_sq_db sq_db;
-	int ps_opcode = 0, i = 0;
+	struct hns_roce_sq_db sq_db = {};
+	int ps_opcode, i;
 	unsigned long flags = 0;
 	void *wqe = NULL;
 	__le32 doorbell[2];
-	u32 wqe_idx = 0;
-	int nreq = 0;
 	int ret = 0;
-	u8 *smac;
 	int loopback;
+	u32 wqe_idx;
+	int nreq;
+	u8 *smac;
 
 	if (unlikely(ibqp->qp_type != IB_QPT_GSI &&
 		ibqp->qp_type != IB_QPT_RC)) {
@@ -106,7 +106,7 @@
 			goto out;
 		}
 
-		wqe = get_send_wqe(qp, wqe_idx);
+		wqe = hns_roce_get_send_wqe(qp, wqe_idx);
 		qp->sq.wrid[wqe_idx] = wr->wr_id;
 
 		/* Corresponding to the RC and RD type wqe process separately */
@@ -317,8 +317,6 @@
 		/* Memory barrier */
 		wmb();
 
-		sq_db.u32_4 = 0;
-		sq_db.u32_8 = 0;
 		roce_set_field(sq_db.u32_4, SQ_DOORBELL_U32_4_SQ_HEAD_M,
 			       SQ_DOORBELL_U32_4_SQ_HEAD_S,
 			      (qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1)));
@@ -350,7 +348,7 @@
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_rq_db rq_db;
+	struct hns_roce_rq_db rq_db = {};
 	__le32 doorbell[2] = {0};
 	unsigned long flags = 0;
 	unsigned int wqe_idx;
@@ -379,7 +377,7 @@
 			goto out;
 		}
 
-		ctrl = get_recv_wqe(hr_qp, wqe_idx);
+		ctrl = hns_roce_get_recv_wqe(hr_qp, wqe_idx);
 
 		roce_set_field(ctrl->rwqe_byte_12,
 			       RQ_WQE_CTRL_RWQE_BYTE_12_RWQE_SGE_NUM_M,
@@ -417,9 +415,6 @@
 				   ROCEE_QP1C_CFG3_0_REG +
 				   QP1C_CFGN_OFFSET * hr_qp->phy_port, reg_val);
 		} else {
-			rq_db.u32_4 = 0;
-			rq_db.u32_8 = 0;
-
 			roce_set_field(rq_db.u32_4, RQ_DOORBELL_U32_4_RQ_HEAD_M,
 				       RQ_DOORBELL_U32_4_RQ_HEAD_S,
 				       hr_qp->rq.head);
@@ -507,16 +502,13 @@
 static void hns_roce_set_sdb_ext(struct hns_roce_dev *hr_dev, u32 ext_sdb_alept,
 				 u32 ext_sdb_alful)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_db_table *db = &priv->db_table;
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_v1_priv *priv;
-	struct hns_roce_db_table *db;
 	dma_addr_t sdb_dma_addr;
 	__le32 tmp;
 	u32 val;
 
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	db = &priv->db_table;
-
 	/* Configure extend SDB threshold */
 	roce_write(hr_dev, ROCEE_EXT_DB_SQ_WL_EMPTY_REG, ext_sdb_alept);
 	roce_write(hr_dev, ROCEE_EXT_DB_SQ_WL_REG, ext_sdb_alful);
@@ -542,23 +534,20 @@
 	roce_write(hr_dev, ROCEE_EXT_DB_SQ_H_REG, val);
 
 	dev_dbg(dev, "ext SDB depth: 0x%x\n", db->ext_db->esdb_dep);
-	dev_dbg(dev, "ext SDB threshold: epmty: 0x%x, ful: 0x%x\n",
+	dev_dbg(dev, "ext SDB threshold: empty: 0x%x, ful: 0x%x\n",
 		ext_sdb_alept, ext_sdb_alful);
 }
 
 static void hns_roce_set_odb_ext(struct hns_roce_dev *hr_dev, u32 ext_odb_alept,
 				 u32 ext_odb_alful)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_db_table *db = &priv->db_table;
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_v1_priv *priv;
-	struct hns_roce_db_table *db;
 	dma_addr_t odb_dma_addr;
 	__le32 tmp;
 	u32 val;
 
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	db = &priv->db_table;
-
 	/* Configure extend ODB threshold */
 	roce_write(hr_dev, ROCEE_EXT_DB_OTHERS_WL_EMPTY_REG, ext_odb_alept);
 	roce_write(hr_dev, ROCEE_EXT_DB_OTHERS_WL_REG, ext_odb_alful);
@@ -587,16 +576,13 @@
 static int hns_roce_db_ext_init(struct hns_roce_dev *hr_dev, u32 sdb_ext_mod,
 				u32 odb_ext_mod)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_db_table *db = &priv->db_table;
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_v1_priv *priv;
-	struct hns_roce_db_table *db;
 	dma_addr_t sdb_dma_addr;
 	dma_addr_t odb_dma_addr;
 	int ret = 0;
 
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	db = &priv->db_table;
-
 	db->ext_db = kmalloc(sizeof(*db->ext_db), GFP_KERNEL);
 	if (!db->ext_db)
 		return -ENOMEM;
@@ -696,14 +682,14 @@
 
 static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_free_mr *free_mr = &priv->free_mr;
 	struct hns_roce_caps *caps = &hr_dev->caps;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct device *dev = &hr_dev->pdev->dev;
 	struct ib_cq_init_attr cq_init_attr;
-	struct hns_roce_free_mr *free_mr;
 	struct ib_qp_attr attr = { 0 };
-	struct hns_roce_v1_priv *priv;
 	struct hns_roce_qp *hr_qp;
-	struct ib_device *ibdev;
 	struct ib_cq *cq;
 	struct ib_pd *pd;
 	union ib_gid dgid;
@@ -716,19 +702,15 @@
 	u8 port = 0;
 	u8 sl;
 
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	free_mr = &priv->free_mr;
-
 	/* Reserved cq for loop qp */
 	cq_init_attr.cqe		= HNS_ROCE_MIN_WQE_NUM * 2;
 	cq_init_attr.comp_vector	= 0;
 
-	ibdev = &hr_dev->ib_dev;
 	cq = rdma_zalloc_drv_obj(ibdev, ib_cq);
 	if (!cq)
 		return -ENOMEM;
 
-	ret = hns_roce_ib_create_cq(cq, &cq_init_attr, NULL);
+	ret = hns_roce_create_cq(cq, &cq_init_attr, NULL);
 	if (ret) {
 		dev_err(dev, "Create cq for reserved loop qp failed!");
 		goto alloc_cq_failed;
@@ -864,7 +846,7 @@
 	kfree(pd);
 
 alloc_mem_failed:
-	hns_roce_ib_destroy_cq(cq, NULL);
+	hns_roce_destroy_cq(cq, NULL);
 alloc_cq_failed:
 	kfree(cq);
 	return ret;
@@ -872,16 +854,13 @@
 
 static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_free_mr *free_mr = &priv->free_mr;
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_free_mr *free_mr;
-	struct hns_roce_v1_priv *priv;
 	struct hns_roce_qp *hr_qp;
 	int ret;
 	int i;
 
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	free_mr = &priv->free_mr;
-
 	for (i = 0; i < HNS_ROCE_V1_RESV_QP; i++) {
 		hr_qp = free_mr->mr_free_qp[i];
 		if (!hr_qp)
@@ -893,7 +872,7 @@
 				i, ret);
 	}
 
-	hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL);
+	hns_roce_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL);
 	kfree(&free_mr->mr_free_cq->ib_cq);
 	hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd, NULL);
 	kfree(&free_mr->mr_free_pd->ibpd);
@@ -901,17 +880,14 @@
 
 static int hns_roce_db_init(struct hns_roce_dev *hr_dev)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_db_table *db = &priv->db_table;
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_v1_priv *priv;
-	struct hns_roce_db_table *db;
 	u32 sdb_ext_mod;
 	u32 odb_ext_mod;
 	u32 sdb_evt_mod;
 	u32 odb_evt_mod;
-	int ret = 0;
-
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	db = &priv->db_table;
+	int ret;
 
 	memset(db, 0, sizeof(*db));
 
@@ -958,15 +934,12 @@
 
 static int hns_roce_v1_recreate_lp_qp(struct hns_roce_dev *hr_dev)
 {
-	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_recreate_lp_qp_work *lp_qp_work;
-	struct hns_roce_free_mr *free_mr;
-	struct hns_roce_v1_priv *priv;
-	struct completion comp;
 	long end = HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS;
-
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	free_mr = &priv->free_mr;
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_free_mr *free_mr = &priv->free_mr;
+	struct hns_roce_recreate_lp_qp_work *lp_qp_work;
+	struct device *dev = &hr_dev->pdev->dev;
+	struct completion comp;
 
 	lp_qp_work = kzalloc(sizeof(struct hns_roce_recreate_lp_qp_work),
 			     GFP_KERNEL);
@@ -1025,29 +998,21 @@
 
 static void hns_roce_v1_mr_free_work_fn(struct work_struct *work)
 {
-	struct hns_roce_mr_free_work *mr_work;
-	struct ib_wc wc[HNS_ROCE_V1_RESV_QP];
-	struct hns_roce_free_mr *free_mr;
-	struct hns_roce_cq *mr_free_cq;
-	struct hns_roce_v1_priv *priv;
-	struct hns_roce_dev *hr_dev;
-	struct hns_roce_mr *hr_mr;
-	struct hns_roce_qp *hr_qp;
-	struct device *dev;
 	unsigned long end =
 		msecs_to_jiffies(HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS) + jiffies;
-	int i;
-	int ret;
+	struct hns_roce_mr_free_work *mr_work =
+		container_of(work, struct hns_roce_mr_free_work, work);
+	struct hns_roce_dev *hr_dev = to_hr_dev(mr_work->ib_dev);
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_free_mr *free_mr = &priv->free_mr;
+	struct hns_roce_cq *mr_free_cq = free_mr->mr_free_cq;
+	struct hns_roce_mr *hr_mr = mr_work->mr;
+	struct device *dev = &hr_dev->pdev->dev;
+	struct ib_wc wc[HNS_ROCE_V1_RESV_QP];
+	struct hns_roce_qp *hr_qp;
 	int ne = 0;
-
-	mr_work = container_of(work, struct hns_roce_mr_free_work, work);
-	hr_mr = (struct hns_roce_mr *)mr_work->mr;
-	hr_dev = to_hr_dev(mr_work->ib_dev);
-	dev = &hr_dev->pdev->dev;
-
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	free_mr = &priv->free_mr;
-	mr_free_cq = free_mr->mr_free_cq;
+	int ret;
+	int i;
 
 	for (i = 0; i < HNS_ROCE_V1_RESV_QP; i++) {
 		hr_qp = free_mr->mr_free_qp[i];
@@ -1096,23 +1061,20 @@
 static int hns_roce_v1_dereg_mr(struct hns_roce_dev *hr_dev,
 				struct hns_roce_mr *mr, struct ib_udata *udata)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_free_mr *free_mr = &priv->free_mr;
+	long end = HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS;
 	struct device *dev = &hr_dev->pdev->dev;
 	struct hns_roce_mr_free_work *mr_work;
-	struct hns_roce_free_mr *free_mr;
-	struct hns_roce_v1_priv *priv;
-	struct completion comp;
-	long end = HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS;
 	unsigned long start = jiffies;
-	int npages;
+	struct completion comp;
 	int ret = 0;
 
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	free_mr = &priv->free_mr;
-
 	if (mr->enabled) {
-		if (hns_roce_hw2sw_mpt(hr_dev, NULL, key_to_hw_index(mr->key)
-				       & (hr_dev->caps.num_mtpts - 1)))
-			dev_warn(dev, "HW2SW_MPT failed!\n");
+		if (hns_roce_hw_destroy_mpt(hr_dev, NULL,
+					    key_to_hw_index(mr->key) &
+					    (hr_dev->caps.num_mtpts - 1)))
+			dev_warn(dev, "DESTROY_MPT failed!\n");
 	}
 
 	mr_work = kzalloc(sizeof(*mr_work), GFP_KERNEL);
@@ -1149,17 +1111,9 @@
 	dev_dbg(dev, "Free mr 0x%x use 0x%x us.\n",
 		mr->key, jiffies_to_usecs(jiffies) - jiffies_to_usecs(start));
 
-	if (mr->size != ~0ULL) {
-		npages = ib_umem_page_count(mr->umem);
-		dma_free_coherent(dev, npages * 8, mr->pbl_buf,
-				  mr->pbl_dma_addr);
-	}
-
 	hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap,
 			     key_to_hw_index(mr->key), 0);
-
-	ib_umem_release(mr->umem);
-
+	hns_roce_mtr_destroy(hr_dev, &mr->pbl_mtr);
 	kfree(mr);
 
 	return ret;
@@ -1167,12 +1121,9 @@
 
 static void hns_roce_db_free(struct hns_roce_dev *hr_dev)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_db_table *db = &priv->db_table;
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_v1_priv *priv;
-	struct hns_roce_db_table *db;
-
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	db = &priv->db_table;
 
 	if (db->sdb_ext_mod) {
 		dma_free_coherent(dev, HNS_ROCE_V1_EXT_SDB_SIZE,
@@ -1193,17 +1144,14 @@
 
 static int hns_roce_raq_init(struct hns_roce_dev *hr_dev)
 {
-	int ret;
-	u32 val;
-	__le32 tmp;
-	int raq_shift = 0;
-	dma_addr_t addr;
-	struct hns_roce_v1_priv *priv;
-	struct hns_roce_raq_table *raq;
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_raq_table *raq = &priv->raq_table;
 	struct device *dev = &hr_dev->pdev->dev;
-
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	raq = &priv->raq_table;
+	dma_addr_t addr;
+	int raq_shift;
+	__le32 tmp;
+	u32 val;
+	int ret;
 
 	raq->e_raq_buf = kzalloc(sizeof(*(raq->e_raq_buf)), GFP_KERNEL);
 	if (!raq->e_raq_buf)
@@ -1283,12 +1231,9 @@
 
 static void hns_roce_raq_free(struct hns_roce_dev *hr_dev)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_raq_table *raq = &priv->raq_table;
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_v1_priv *priv;
-	struct hns_roce_raq_table *raq;
-
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	raq = &priv->raq_table;
 
 	dma_free_coherent(dev, HNS_ROCE_V1_RAQ_SIZE, raq->e_raq_buf->buf,
 			  raq->e_raq_buf->map);
@@ -1322,12 +1267,10 @@
 
 static int hns_roce_bt_init(struct hns_roce_dev *hr_dev)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_v1_priv *priv;
 	int ret;
 
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-
 	priv->bt_table.qpc_buf.buf = dma_alloc_coherent(dev,
 		HNS_ROCE_BT_RSV_BUF_SIZE, &priv->bt_table.qpc_buf.map,
 		GFP_KERNEL);
@@ -1365,10 +1308,8 @@
 
 static void hns_roce_bt_free(struct hns_roce_dev *hr_dev)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_v1_priv *priv;
-
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
 
 	dma_free_coherent(dev, HNS_ROCE_BT_RSV_BUF_SIZE,
 		priv->bt_table.cqc_buf.buf, priv->bt_table.cqc_buf.map);
@@ -1382,12 +1323,9 @@
 
 static int hns_roce_tptr_init(struct hns_roce_dev *hr_dev)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_buf_list *tptr_buf = &priv->tptr_table.tptr_buf;
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_buf_list *tptr_buf;
-	struct hns_roce_v1_priv *priv;
-
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	tptr_buf = &priv->tptr_table.tptr_buf;
 
 	/*
 	 * This buffer will be used for CQ's tptr(tail pointer), also
@@ -1408,12 +1346,9 @@
 
 static void hns_roce_tptr_free(struct hns_roce_dev *hr_dev)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_buf_list *tptr_buf = &priv->tptr_table.tptr_buf;
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_buf_list *tptr_buf;
-	struct hns_roce_v1_priv *priv;
-
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	tptr_buf = &priv->tptr_table.tptr_buf;
 
 	dma_free_coherent(dev, HNS_ROCE_V1_TPTR_BUF_SIZE,
 			  tptr_buf->buf, tptr_buf->map);
@@ -1421,13 +1356,10 @@
 
 static int hns_roce_free_mr_init(struct hns_roce_dev *hr_dev)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_free_mr *free_mr = &priv->free_mr;
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_free_mr *free_mr;
-	struct hns_roce_v1_priv *priv;
-	int ret = 0;
-
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	free_mr = &priv->free_mr;
+	int ret;
 
 	free_mr->free_mr_wq = create_singlethread_workqueue("hns_roce_free_mr");
 	if (!free_mr->free_mr_wq) {
@@ -1447,11 +1379,8 @@
 
 static void hns_roce_free_mr_free(struct hns_roce_dev *hr_dev)
 {
-	struct hns_roce_free_mr *free_mr;
-	struct hns_roce_v1_priv *priv;
-
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	free_mr = &priv->free_mr;
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_free_mr *free_mr = &priv->free_mr;
 
 	flush_workqueue(free_mr->free_mr_wq);
 	destroy_workqueue(free_mr->free_mr_wq);
@@ -1510,8 +1439,8 @@
 
 static int hns_roce_v1_profile(struct hns_roce_dev *hr_dev)
 {
-	int i = 0;
 	struct hns_roce_caps *caps = &hr_dev->caps;
+	int i;
 
 	hr_dev->vendor_id = roce_read(hr_dev, ROCEE_VENDOR_ID_REG);
 	hr_dev->vendor_part_id = roce_read(hr_dev, ROCEE_VENDOR_PART_ID_REG);
@@ -1541,12 +1470,12 @@
 	caps->max_qp_dest_rdma	= HNS_ROCE_V1_MAX_QP_DEST_RDMA;
 	caps->max_sq_desc_sz	= HNS_ROCE_V1_MAX_SQ_DESC_SZ;
 	caps->max_rq_desc_sz	= HNS_ROCE_V1_MAX_RQ_DESC_SZ;
-	caps->qpc_entry_sz	= HNS_ROCE_V1_QPC_ENTRY_SIZE;
+	caps->qpc_sz		= HNS_ROCE_V1_QPC_SIZE;
 	caps->irrl_entry_sz	= HNS_ROCE_V1_IRRL_ENTRY_SIZE;
 	caps->cqc_entry_sz	= HNS_ROCE_V1_CQC_ENTRY_SIZE;
 	caps->mtpt_entry_sz	= HNS_ROCE_V1_MTPT_ENTRY_SIZE;
 	caps->mtt_entry_sz	= HNS_ROCE_V1_MTT_ENTRY_SIZE;
-	caps->cq_entry_sz	= HNS_ROCE_V1_CQE_ENTRY_SIZE;
+	caps->cqe_sz		= HNS_ROCE_V1_CQE_SIZE;
 	caps->page_size_cap	= HNS_ROCE_V1_PAGE_SIZE_SUPPORT;
 	caps->reserved_lkey	= 0;
 	caps->reserved_pds	= 0;
@@ -1713,7 +1642,7 @@
 				unsigned long timeout)
 {
 	u8 __iomem *hcr = hr_dev->reg_base + ROCEE_MB1_REG;
-	unsigned long end = 0;
+	unsigned long end;
 	u32 status = 0;
 
 	end = msecs_to_jiffies(timeout) + jiffies;
@@ -1741,7 +1670,7 @@
 {
 	unsigned long flags;
 	u32 *p = NULL;
-	u8 gid_idx = 0;
+	u8 gid_idx;
 
 	gid_idx = hns_get_gid_index(hr_dev, port, gid_index);
 
@@ -1826,12 +1755,15 @@
 		   val);
 }
 
-static int hns_roce_v1_write_mtpt(void *mb_buf, struct hns_roce_mr *mr,
+static int hns_roce_v1_write_mtpt(struct hns_roce_dev *hr_dev, void *mb_buf,
+				  struct hns_roce_mr *mr,
 				  unsigned long mtpt_idx)
 {
+	u64 pages[HNS_ROCE_MAX_INNER_MTPT_NUM] = { 0 };
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_v1_mpt_entry *mpt_entry;
-	struct sg_dma_page_iter sg_iter;
-	u64 *pages;
+	dma_addr_t pbl_ba;
+	int count;
 	int i;
 
 	/* MPT filled into mailbox buf */
@@ -1881,22 +1813,15 @@
 	if (mr->type == MR_TYPE_DMA)
 		return 0;
 
-	pages = (u64 *) __get_free_page(GFP_KERNEL);
-	if (!pages)
-		return -ENOMEM;
-
-	i = 0;
-	for_each_sg_dma_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) {
-		pages[i] = ((u64)sg_page_iter_dma_address(&sg_iter)) >> 12;
-
-		/* Directly record to MTPT table firstly 7 entry */
-		if (i >= HNS_ROCE_MAX_INNER_MTPT_NUM)
-			break;
-		i++;
+	count = hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, pages,
+				  ARRAY_SIZE(pages), &pbl_ba);
+	if (count < 1) {
+		ibdev_err(ibdev, "failed to find PBL mtr, count = %d.", count);
+		return -ENOBUFS;
 	}
 
 	/* Register user mr */
-	for (i = 0; i < HNS_ROCE_MAX_INNER_MTPT_NUM; i++) {
+	for (i = 0; i < count; i++) {
 		switch (i) {
 		case 0:
 			mpt_entry->pa0_l = cpu_to_le32((u32)(pages[i]));
@@ -1962,21 +1887,16 @@
 		}
 	}
 
-	free_page((unsigned long) pages);
-
-	mpt_entry->pbl_addr_l = cpu_to_le32((u32)(mr->pbl_dma_addr));
-
+	mpt_entry->pbl_addr_l = cpu_to_le32(pbl_ba);
 	roce_set_field(mpt_entry->mpt_byte_12, MPT_BYTE_12_PBL_ADDR_H_M,
-		       MPT_BYTE_12_PBL_ADDR_H_S,
-		       ((u32)(mr->pbl_dma_addr >> 32)));
+		       MPT_BYTE_12_PBL_ADDR_H_S, upper_32_bits(pbl_ba));
 
 	return 0;
 }
 
 static void *get_cqe(struct hns_roce_cq *hr_cq, int n)
 {
-	return hns_roce_buf_offset(&hr_cq->hr_buf.hr_buf,
-				   n * HNS_ROCE_V1_CQE_ENTRY_SIZE);
+	return hns_roce_buf_offset(hr_cq->mtr.kmem, n * HNS_ROCE_V1_CQE_SIZE);
 }
 
 static void *get_sw_cqe(struct hns_roce_cq *hr_cq, int n)
@@ -1985,7 +1905,7 @@
 
 	/* Get cqe when Owner bit is Conversely with the MSB of cons_idx */
 	return (roce_get_bit(hr_cqe->cqe_byte_4, CQE_BYTE_4_OWNER_S) ^
-		!!(n & (hr_cq->ib_cq.cqe + 1))) ? hr_cqe : NULL;
+		!!(n & hr_cq->cq_depth)) ? hr_cqe : NULL;
 }
 
 static struct hns_roce_cqe *next_cqe_sw(struct hns_roce_cq *hr_cq)
@@ -2068,19 +1988,14 @@
 
 static void hns_roce_v1_write_cqc(struct hns_roce_dev *hr_dev,
 				  struct hns_roce_cq *hr_cq, void *mb_buf,
-				  u64 *mtts, dma_addr_t dma_handle, int nent,
-				  u32 vector)
+				  u64 *mtts, dma_addr_t dma_handle)
 {
-	struct hns_roce_cq_context *cq_context = NULL;
-	struct hns_roce_buf_list *tptr_buf;
-	struct hns_roce_v1_priv *priv;
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
+	struct hns_roce_buf_list *tptr_buf = &priv->tptr_table.tptr_buf;
+	struct hns_roce_cq_context *cq_context = mb_buf;
 	dma_addr_t tptr_dma_addr;
 	int offset;
 
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-	tptr_buf = &priv->tptr_table.tptr_buf;
-
-	cq_context = mb_buf;
 	memset(cq_context, 0, sizeof(*cq_context));
 
 	/* Get the tptr for this CQ. */
@@ -2104,9 +2019,9 @@
 	roce_set_field(cq_context->cqc_byte_12,
 		       CQ_CONTEXT_CQC_BYTE_12_CQ_CQE_SHIFT_M,
 		       CQ_CONTEXT_CQC_BYTE_12_CQ_CQE_SHIFT_S,
-		       ilog2((unsigned int)nent));
+		       ilog2(hr_cq->cq_depth));
 	roce_set_field(cq_context->cqc_byte_12, CQ_CONTEXT_CQC_BYTE_12_CEQN_M,
-		       CQ_CONTEXT_CQC_BYTE_12_CEQN_S, vector);
+		       CQ_CONTEXT_CQC_BYTE_12_CEQN_S, hr_cq->vector);
 
 	cq_context->cur_cqe_ba0_l = cpu_to_le32((u32)(mtts[0]));
 
@@ -2289,9 +2204,10 @@
 
 	if (is_send) {
 		/* SQ conrespond to CQE */
-		sq_wqe = get_send_wqe(*cur_qp, roce_get_field(cqe->cqe_byte_4,
+		sq_wqe = hns_roce_get_send_wqe(*cur_qp,
+						roce_get_field(cqe->cqe_byte_4,
 						CQE_BYTE_4_WQE_INDEX_M,
-						CQE_BYTE_4_WQE_INDEX_S)&
+						CQE_BYTE_4_WQE_INDEX_S) &
 						((*cur_qp)->sq.wqe_cnt-1));
 		switch (le32_to_cpu(sq_wqe->flag) & HNS_ROCE_WQE_OPCODE_MASK) {
 		case HNS_ROCE_WQE_OPCODE_SEND:
@@ -2420,16 +2336,14 @@
 				 struct hns_roce_hem_table *table, int obj,
 				 int step_idx)
 {
+	struct hns_roce_v1_priv *priv = hr_dev->priv;
 	struct device *dev = &hr_dev->pdev->dev;
-	struct hns_roce_v1_priv *priv;
-	unsigned long flags = 0;
 	long end = HW_SYNC_TIMEOUT_MSECS;
 	__le32 bt_cmd_val[2] = {0};
+	unsigned long flags = 0;
 	void __iomem *bt_cmd;
 	u64 bt_ba = 0;
 
-	priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-
 	switch (table->type) {
 	case HEM_TYPE_QPC:
 		bt_ba = priv->bt_table.qpc_buf.map >> 12;
@@ -2483,7 +2397,6 @@
 }
 
 static int hns_roce_v1_qp_modify(struct hns_roce_dev *hr_dev,
-				 struct hns_roce_mtt *mtt,
 				 enum hns_roce_qp_state cur_state,
 				 enum hns_roce_qp_state new_state,
 				 struct hns_roce_qp_context *context,
@@ -2530,7 +2443,7 @@
 
 	struct hns_roce_cmd_mailbox *mailbox;
 	struct device *dev = &hr_dev->pdev->dev;
-	int ret = 0;
+	int ret;
 
 	if (cur_state >= HNS_ROCE_QP_NUM_STATE ||
 	    new_state >= HNS_ROCE_QP_NUM_STATE ||
@@ -2564,6 +2477,28 @@
 	return ret;
 }
 
+static int find_wqe_mtt(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
+			u64 *sq_ba, u64 *rq_ba, dma_addr_t *bt_ba)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	int count;
+
+	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, sq_ba, 1, bt_ba);
+	if (count < 1) {
+		ibdev_err(ibdev, "Failed to find SQ ba\n");
+		return -ENOBUFS;
+	}
+
+	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->rq.offset, rq_ba,
+				  1, NULL);
+	if (!count) {
+		ibdev_err(ibdev, "Failed to find RQ ba\n");
+		return -ENOBUFS;
+	}
+
+	return 0;
+}
+
 static int hns_roce_v1_m_sqp(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
 			     int attr_mask, enum ib_qp_state cur_state,
 			     enum ib_qp_state new_state)
@@ -2571,25 +2506,20 @@
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 	struct hns_roce_sqp_context *context;
-	struct device *dev = &hr_dev->pdev->dev;
 	dma_addr_t dma_handle = 0;
 	u32 __iomem *addr;
-	int rq_pa_start;
+	u64 sq_ba = 0;
+	u64 rq_ba = 0;
 	__le32 tmp;
 	u32 reg_val;
-	u64 *mtts;
 
 	context = kzalloc(sizeof(*context), GFP_KERNEL);
 	if (!context)
 		return -ENOMEM;
 
 	/* Search QP buf's MTTs */
-	mtts = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_table,
-				   hr_qp->mtt.first_seg, &dma_handle);
-	if (!mtts) {
-		dev_err(dev, "qp buf pa find failed\n");
+	if (find_wqe_mtt(hr_dev, hr_qp, &sq_ba, &rq_ba, &dma_handle))
 		goto out;
-	}
 
 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
 		roce_set_field(context->qp1c_bytes_4,
@@ -2603,11 +2533,11 @@
 		roce_set_field(context->qp1c_bytes_4, QP1C_BYTES_4_PD_M,
 			       QP1C_BYTES_4_PD_S, to_hr_pd(ibqp->pd)->pdn);
 
-		context->sq_rq_bt_l = cpu_to_le32((u32)(dma_handle));
+		context->sq_rq_bt_l = cpu_to_le32(dma_handle);
 		roce_set_field(context->qp1c_bytes_12,
 			       QP1C_BYTES_12_SQ_RQ_BT_H_M,
 			       QP1C_BYTES_12_SQ_RQ_BT_H_S,
-			       ((u32)(dma_handle >> 32)));
+			       upper_32_bits(dma_handle));
 
 		roce_set_field(context->qp1c_bytes_16, QP1C_BYTES_16_RQ_HEAD_M,
 			       QP1C_BYTES_16_RQ_HEAD_S, hr_qp->rq.head);
@@ -2628,14 +2558,12 @@
 		roce_set_field(context->qp1c_bytes_20, QP1C_BYTES_20_PKEY_IDX_M,
 			       QP1C_BYTES_20_PKEY_IDX_S, attr->pkey_index);
 
-		rq_pa_start = (u32)hr_qp->rq.offset / PAGE_SIZE;
-		context->cur_rq_wqe_ba_l =
-				cpu_to_le32((u32)(mtts[rq_pa_start]));
+		context->cur_rq_wqe_ba_l = cpu_to_le32(rq_ba);
 
 		roce_set_field(context->qp1c_bytes_28,
 			       QP1C_BYTES_28_CUR_RQ_WQE_BA_H_M,
 			       QP1C_BYTES_28_CUR_RQ_WQE_BA_H_S,
-			       (mtts[rq_pa_start]) >> 32);
+			       upper_32_bits(rq_ba));
 		roce_set_field(context->qp1c_bytes_28,
 			       QP1C_BYTES_28_RQ_CUR_IDX_M,
 			       QP1C_BYTES_28_RQ_CUR_IDX_S, 0);
@@ -2649,12 +2577,12 @@
 			       QP1C_BYTES_32_TX_CQ_NUM_S,
 			       to_hr_cq(ibqp->send_cq)->cqn);
 
-		context->cur_sq_wqe_ba_l  = cpu_to_le32((u32)mtts[0]);
+		context->cur_sq_wqe_ba_l = cpu_to_le32(sq_ba);
 
 		roce_set_field(context->qp1c_bytes_40,
 			       QP1C_BYTES_40_CUR_SQ_WQE_BA_H_M,
 			       QP1C_BYTES_40_CUR_SQ_WQE_BA_H_S,
-			       (mtts[0]) >> 32);
+			       upper_32_bits(sq_ba));
 		roce_set_field(context->qp1c_bytes_40,
 			       QP1C_BYTES_40_SQ_CUR_IDX_M,
 			       QP1C_BYTES_40_SQ_CUR_IDX_S, 0);
@@ -2708,6 +2636,28 @@
 	return -EINVAL;
 }
 
+static bool check_qp_state(enum ib_qp_state cur_state,
+			   enum ib_qp_state new_state)
+{
+	static const bool sm[][IB_QPS_ERR + 1] = {
+		[IB_QPS_RESET] = { [IB_QPS_RESET] = true,
+				   [IB_QPS_INIT] = true },
+		[IB_QPS_INIT] = { [IB_QPS_RESET] = true,
+				  [IB_QPS_INIT] = true,
+				  [IB_QPS_RTR] = true,
+				  [IB_QPS_ERR] = true },
+		[IB_QPS_RTR] = { [IB_QPS_RESET] = true,
+				 [IB_QPS_RTS] = true,
+				 [IB_QPS_ERR] = true },
+		[IB_QPS_RTS] = { [IB_QPS_RESET] = true, [IB_QPS_ERR] = true },
+		[IB_QPS_SQD] = {},
+		[IB_QPS_SQE] = {},
+		[IB_QPS_ERR] = { [IB_QPS_RESET] = true, [IB_QPS_ERR] = true }
+	};
+
+	return sm[cur_state][new_state];
+}
+
 static int hns_roce_v1_m_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
 			    int attr_mask, enum ib_qp_state cur_state,
 			    enum ib_qp_state new_state)
@@ -2720,26 +2670,29 @@
 	dma_addr_t dma_handle_2 = 0;
 	dma_addr_t dma_handle = 0;
 	__le32 doorbell[2] = {0};
-	int rq_pa_start = 0;
 	u64 *mtts_2 = NULL;
 	int ret = -EINVAL;
-	u64 *mtts = NULL;
+	u64 sq_ba = 0;
+	u64 rq_ba = 0;
 	int port;
 	u8 port_num;
 	u8 *dmac;
 	u8 *smac;
 
+	if (!check_qp_state(cur_state, new_state)) {
+		ibdev_err(ibqp->device,
+			  "not support QP(%u) status from %d to %d\n",
+			  ibqp->qp_num, cur_state, new_state);
+		return -EINVAL;
+	}
+
 	context = kzalloc(sizeof(*context), GFP_KERNEL);
 	if (!context)
 		return -ENOMEM;
 
 	/* Search qp buf's mtts */
-	mtts = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_table,
-				   hr_qp->mtt.first_seg, &dma_handle);
-	if (mtts == NULL) {
-		dev_err(dev, "qp buf pa find failed\n");
+	if (find_wqe_mtt(hr_dev, hr_qp, &sq_ba, &rq_ba, &dma_handle))
 		goto out;
-	}
 
 	/* Search IRRL's mtts */
 	mtts_2 = hns_roce_table_find(hr_dev, &hr_dev->qp_table.irrl_table,
@@ -2894,11 +2847,11 @@
 
 		dmac = (u8 *)attr->ah_attr.roce.dmac;
 
-		context->sq_rq_bt_l = cpu_to_le32((u32)(dma_handle));
+		context->sq_rq_bt_l = cpu_to_le32(dma_handle);
 		roce_set_field(context->qpc_bytes_24,
 			       QP_CONTEXT_QPC_BYTES_24_SQ_RQ_BT_H_M,
 			       QP_CONTEXT_QPC_BYTES_24_SQ_RQ_BT_H_S,
-			       ((u32)(dma_handle >> 32)));
+			       upper_32_bits(dma_handle));
 		roce_set_bit(context->qpc_bytes_24,
 			     QP_CONTEXT_QPC_BYTE_24_REMOTE_ENABLE_E2E_CREDITS_S,
 			     1);
@@ -2997,14 +2950,12 @@
 			       QP_CONTEXT_QPC_BYTES_68_RQ_CUR_INDEX_M,
 			       QP_CONTEXT_QPC_BYTES_68_RQ_CUR_INDEX_S, 0);
 
-		rq_pa_start = (u32)hr_qp->rq.offset / PAGE_SIZE;
-		context->cur_rq_wqe_ba_l =
-				cpu_to_le32((u32)(mtts[rq_pa_start]));
+		context->cur_rq_wqe_ba_l = cpu_to_le32(rq_ba);
 
 		roce_set_field(context->qpc_bytes_76,
 			QP_CONTEXT_QPC_BYTES_76_CUR_RQ_WQE_BA_H_M,
 			QP_CONTEXT_QPC_BYTES_76_CUR_RQ_WQE_BA_H_S,
-			mtts[rq_pa_start] >> 32);
+			upper_32_bits(rq_ba));
 		roce_set_field(context->qpc_bytes_76,
 			       QP_CONTEXT_QPC_BYTES_76_RX_REQ_MSN_M,
 			       QP_CONTEXT_QPC_BYTES_76_RX_REQ_MSN_S, 0);
@@ -3066,8 +3017,7 @@
 			       QP_CONTEXT_QPC_BYTES_156_SL_S,
 			       rdma_ah_get_sl(&attr->ah_attr));
 		hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr);
-	} else if (cur_state == IB_QPS_RTR &&
-		new_state == IB_QPS_RTS) {
+	} else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) {
 		/* If exist optional param, return error */
 		if ((attr_mask & IB_QP_ALT_PATH) ||
 		    (attr_mask & IB_QP_ACCESS_FLAGS) ||
@@ -3079,12 +3029,12 @@
 			goto out;
 		}
 
-		context->rx_cur_sq_wqe_ba_l = cpu_to_le32((u32)(mtts[0]));
+		context->rx_cur_sq_wqe_ba_l = cpu_to_le32(sq_ba);
 
 		roce_set_field(context->qpc_bytes_120,
 			       QP_CONTEXT_QPC_BYTES_120_RX_CUR_SQ_WQE_BA_H_M,
 			       QP_CONTEXT_QPC_BYTES_120_RX_CUR_SQ_WQE_BA_H_S,
-			       (mtts[0]) >> 32);
+			       upper_32_bits(sq_ba));
 
 		roce_set_field(context->qpc_bytes_124,
 			       QP_CONTEXT_QPC_BYTES_124_RX_ACK_MSN_M,
@@ -3227,28 +3177,18 @@
 			       QP_CONTEXT_QPC_BYTES_180_SQ_HEAD_M,
 			       QP_CONTEXT_QPC_BYTES_180_SQ_HEAD_S, 0);
 
-		context->tx_cur_sq_wqe_ba_l = cpu_to_le32((u32)(mtts[0]));
+		context->tx_cur_sq_wqe_ba_l = cpu_to_le32(sq_ba);
 
 		roce_set_field(context->qpc_bytes_188,
 			       QP_CONTEXT_QPC_BYTES_188_TX_CUR_SQ_WQE_BA_H_M,
 			       QP_CONTEXT_QPC_BYTES_188_TX_CUR_SQ_WQE_BA_H_S,
-			       (mtts[0]) >> 32);
+			       upper_32_bits(sq_ba));
 		roce_set_bit(context->qpc_bytes_188,
 			     QP_CONTEXT_QPC_BYTES_188_PKT_RETRY_FLG_S, 0);
 		roce_set_field(context->qpc_bytes_188,
 			       QP_CONTEXT_QPC_BYTES_188_TX_RETRY_CUR_INDEX_M,
 			       QP_CONTEXT_QPC_BYTES_188_TX_RETRY_CUR_INDEX_S,
 			       0);
-	} else if (!((cur_state == IB_QPS_INIT && new_state == IB_QPS_RESET) ||
-		   (cur_state == IB_QPS_INIT && new_state == IB_QPS_ERR) ||
-		   (cur_state == IB_QPS_RTR && new_state == IB_QPS_RESET) ||
-		   (cur_state == IB_QPS_RTR && new_state == IB_QPS_ERR) ||
-		   (cur_state == IB_QPS_RTS && new_state == IB_QPS_RESET) ||
-		   (cur_state == IB_QPS_RTS && new_state == IB_QPS_ERR) ||
-		   (cur_state == IB_QPS_ERR && new_state == IB_QPS_RESET) ||
-		   (cur_state == IB_QPS_ERR && new_state == IB_QPS_ERR))) {
-		dev_err(dev, "not support this status migration\n");
-		goto out;
 	}
 
 	/* Every status migrate must change state */
@@ -3257,8 +3197,7 @@
 		       QP_CONTEXT_QPC_BYTES_144_QP_STATE_S, new_state);
 
 	/* SW pass context to HW */
-	ret = hns_roce_v1_qp_modify(hr_dev, &hr_qp->mtt,
-				    to_hns_roce_state(cur_state),
+	ret = hns_roce_v1_qp_modify(hr_dev, to_hns_roce_state(cur_state),
 				    to_hns_roce_state(new_state), context,
 				    hr_qp);
 	if (ret) {
@@ -3453,7 +3392,7 @@
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 	struct device *dev = &hr_dev->pdev->dev;
 	struct hns_roce_qp_context *context;
-	int tmp_qp_state = 0;
+	int tmp_qp_state;
 	int ret = 0;
 	int state;
 
@@ -3609,54 +3548,37 @@
 	if (ret)
 		return ret;
 
-	send_cq = to_hr_cq(hr_qp->ibqp.send_cq);
-	recv_cq = to_hr_cq(hr_qp->ibqp.recv_cq);
+	send_cq = hr_qp->ibqp.send_cq ? to_hr_cq(hr_qp->ibqp.send_cq) : NULL;
+	recv_cq = hr_qp->ibqp.recv_cq ? to_hr_cq(hr_qp->ibqp.recv_cq) : NULL;
 
 	hns_roce_lock_cqs(send_cq, recv_cq);
 	if (!udata) {
-		__hns_roce_v1_cq_clean(recv_cq, hr_qp->qpn, hr_qp->ibqp.srq ?
-				       to_hr_srq(hr_qp->ibqp.srq) : NULL);
-		if (send_cq != recv_cq)
+		if (recv_cq)
+			__hns_roce_v1_cq_clean(recv_cq, hr_qp->qpn,
+					       (hr_qp->ibqp.srq ?
+						to_hr_srq(hr_qp->ibqp.srq) :
+						NULL));
+
+		if (send_cq && send_cq != recv_cq)
 			__hns_roce_v1_cq_clean(send_cq, hr_qp->qpn, NULL);
 	}
+	hns_roce_qp_remove(hr_dev, hr_qp);
 	hns_roce_unlock_cqs(send_cq, recv_cq);
 
-	hns_roce_qp_remove(hr_dev, hr_qp);
-	hns_roce_qp_free(hr_dev, hr_qp);
+	hns_roce_qp_destroy(hr_dev, hr_qp, udata);
 
-	/* RC QP, release QPN */
-	if (hr_qp->ibqp.qp_type == IB_QPT_RC)
-		hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
-
-	hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt);
-
-	ib_umem_release(hr_qp->umem);
-	if (!udata) {
-		kfree(hr_qp->sq.wrid);
-		kfree(hr_qp->rq.wrid);
-
-		hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
-	}
-
-	if (hr_qp->ibqp.qp_type == IB_QPT_RC)
-		kfree(hr_qp);
-	else
-		kfree(hr_to_hr_sqp(hr_qp));
 	return 0;
 }
 
-static void hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibcq->device);
 	struct hns_roce_cq *hr_cq = to_hr_cq(ibcq);
 	struct device *dev = &hr_dev->pdev->dev;
 	u32 cqe_cnt_ori;
 	u32 cqe_cnt_cur;
-	u32 cq_buf_size;
 	int wait_time = 0;
 
-	hns_roce_free_cq(hr_dev, hr_cq);
-
 	/*
 	 * Before freeing cq buffer, we need to ensure that the outstanding CQE
 	 * have been written by checking the CQE counter.
@@ -3679,15 +3601,7 @@
 		}
 		wait_time++;
 	}
-
-	hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
-
-	ib_umem_release(hr_cq->umem);
-	if (!udata) {
-		/* Free the buff of stored cq */
-		cq_buf_size = (ibcq->cqe + 1) * hr_dev->caps.cq_entry_sz;
-		hns_roce_buf_free(hr_dev, cq_buf_size, &hr_cq->hr_buf.hr_buf);
-	}
+	return 0;
 }
 
 static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not)
@@ -3860,8 +3774,7 @@
 
 static struct hns_roce_aeqe *get_aeqe_v1(struct hns_roce_eq *eq, u32 entry)
 {
-	unsigned long off = (entry & (eq->entries - 1)) *
-			     HNS_ROCE_AEQ_ENTRY_SIZE;
+	unsigned long off = (entry & (eq->entries - 1)) * HNS_ROCE_AEQE_SIZE;
 
 	return (struct hns_roce_aeqe *)((u8 *)
 		(eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) +
@@ -3955,10 +3868,8 @@
 		eq->cons_index++;
 		aeqes_found = 1;
 
-		if (eq->cons_index > 2 * hr_dev->caps.aeqe_depth - 1) {
-			dev_warn(dev, "cons_index overflow, set back to 0.\n");
+		if (eq->cons_index > 2 * hr_dev->caps.aeqe_depth - 1)
 			eq->cons_index = 0;
-		}
 	}
 
 	set_eq_cons_index_v1(eq, 0);
@@ -3968,8 +3879,7 @@
 
 static struct hns_roce_ceqe *get_ceqe_v1(struct hns_roce_eq *eq, u32 entry)
 {
-	unsigned long off = (entry & (eq->entries - 1)) *
-			     HNS_ROCE_CEQ_ENTRY_SIZE;
+	unsigned long off = (entry & (eq->entries - 1)) * HNS_ROCE_CEQE_SIZE;
 
 	return (struct hns_roce_ceqe *)((u8 *)
 			(eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) +
@@ -4008,11 +3918,8 @@
 		ceqes_found = 1;
 
 		if (eq->cons_index >
-		    EQ_DEPTH_COEFF * hr_dev->caps.ceqe_depth - 1) {
-			dev_warn(&eq->hr_dev->pdev->dev,
-				"cons_index overflow, set back to 0.\n");
+		    EQ_DEPTH_COEFF * hr_dev->caps.ceqe_depth - 1)
 			eq->cons_index = 0;
-		}
 	}
 
 	set_eq_cons_index_v1(eq, 0);
@@ -4024,7 +3931,7 @@
 {
 	struct hns_roce_eq  *eq  = eq_ptr;
 	struct hns_roce_dev *hr_dev = eq->hr_dev;
-	int int_work = 0;
+	int int_work;
 
 	if (eq->type_flag == HNS_ROCE_CEQ)
 		/* CEQ irq routine, CEQ is pulse irq, not clear */
@@ -4222,9 +4129,9 @@
 	void __iomem *eqc = hr_dev->eq_table.eqc_base[eq->eqn];
 	struct device *dev = &hr_dev->pdev->dev;
 	dma_addr_t tmp_dma_addr;
-	u32 eqconsindx_val = 0;
 	u32 eqcuridx_val = 0;
-	u32 eqshift_val = 0;
+	u32 eqconsindx_val;
+	u32 eqshift_val;
 	__le32 tmp2 = 0;
 	__le32 tmp1 = 0;
 	__le32 tmp = 0;
@@ -4343,7 +4250,7 @@
 				       CEQ_REG_OFFSET * i;
 			eq->entries = hr_dev->caps.ceqe_depth;
 			eq->log_entries = ilog2(eq->entries);
-			eq->eqe_size = HNS_ROCE_CEQ_ENTRY_SIZE;
+			eq->eqe_size = HNS_ROCE_CEQE_SIZE;
 		} else {
 			/* AEQ */
 			eq_table->eqc_base[i] = hr_dev->reg_base +
@@ -4353,7 +4260,7 @@
 				       ROCEE_CAEP_AEQE_CONS_IDX_REG;
 			eq->entries = hr_dev->caps.aeqe_depth;
 			eq->log_entries = ilog2(eq->entries);
-			eq->eqe_size = HNS_ROCE_AEQ_ENTRY_SIZE;
+			eq->eqe_size = HNS_ROCE_AEQE_SIZE;
 		}
 	}
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
index 52307b2..ffd0156 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h
@@ -68,13 +68,13 @@
 #define HNS_ROCE_V1_COMP_EQE_NUM			0x8000
 #define HNS_ROCE_V1_ASYNC_EQE_NUM			0x400
 
-#define HNS_ROCE_V1_QPC_ENTRY_SIZE			256
+#define HNS_ROCE_V1_QPC_SIZE				256
 #define HNS_ROCE_V1_IRRL_ENTRY_SIZE			8
 #define HNS_ROCE_V1_CQC_ENTRY_SIZE			64
 #define HNS_ROCE_V1_MTPT_ENTRY_SIZE			64
 #define HNS_ROCE_V1_MTT_ENTRY_SIZE			64
 
-#define HNS_ROCE_V1_CQE_ENTRY_SIZE			32
+#define HNS_ROCE_V1_CQE_SIZE				32
 #define HNS_ROCE_V1_PAGE_SIZE_SUPPORT			0xFFFFF000
 
 #define HNS_ROCE_V1_TABLE_CHUNK_SIZE			(1 << 17)
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index d01e322..abe882e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -33,6 +33,7 @@
 #include <linux/acpi.h>
 #include <linux/etherdevice.h>
 #include <linux/interrupt.h>
+#include <linux/iopoll.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <net/addrconf.h>
@@ -56,224 +57,603 @@
 	dseg->len  = cpu_to_le32(sg->length);
 }
 
+/*
+ * mapped-value = 1 + real-value
+ * The real hns wr opcode values start from 0. In order to distinguish between
+ * initialized and uninitialized map values, we add 1 to the actual value when
+ * defining the mapping, so that validity can be identified by checking whether
+ * the mapped value is greater than 0.
+ */
+#define HR_OPC_MAP(ib_key, hr_key) \
+		[IB_WR_ ## ib_key] = 1 + HNS_ROCE_V2_WQE_OP_ ## hr_key
+
+static const u32 hns_roce_op_code[] = {
+	HR_OPC_MAP(RDMA_WRITE,			RDMA_WRITE),
+	HR_OPC_MAP(RDMA_WRITE_WITH_IMM,		RDMA_WRITE_WITH_IMM),
+	HR_OPC_MAP(SEND,			SEND),
+	HR_OPC_MAP(SEND_WITH_IMM,		SEND_WITH_IMM),
+	HR_OPC_MAP(RDMA_READ,			RDMA_READ),
+	HR_OPC_MAP(ATOMIC_CMP_AND_SWP,		ATOM_CMP_AND_SWAP),
+	HR_OPC_MAP(ATOMIC_FETCH_AND_ADD,	ATOM_FETCH_AND_ADD),
+	HR_OPC_MAP(SEND_WITH_INV,		SEND_WITH_INV),
+	HR_OPC_MAP(LOCAL_INV,			LOCAL_INV),
+	HR_OPC_MAP(MASKED_ATOMIC_CMP_AND_SWP,	ATOM_MSK_CMP_AND_SWAP),
+	HR_OPC_MAP(MASKED_ATOMIC_FETCH_AND_ADD,	ATOM_MSK_FETCH_AND_ADD),
+	HR_OPC_MAP(REG_MR,			FAST_REG_PMR),
+};
+
+static u32 to_hr_opcode(u32 ib_opcode)
+{
+	if (ib_opcode >= ARRAY_SIZE(hns_roce_op_code))
+		return HNS_ROCE_V2_WQE_OP_MASK;
+
+	return hns_roce_op_code[ib_opcode] ? hns_roce_op_code[ib_opcode] - 1 :
+					     HNS_ROCE_V2_WQE_OP_MASK;
+}
+
 static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
-			 struct hns_roce_wqe_frmr_seg *fseg,
 			 const struct ib_reg_wr *wr)
 {
+	struct hns_roce_wqe_frmr_seg *fseg =
+		(void *)rc_sq_wqe + sizeof(struct hns_roce_v2_rc_send_wqe);
 	struct hns_roce_mr *mr = to_hr_mr(wr->mr);
+	u64 pbl_ba;
 
 	/* use ib_access_flags */
-	roce_set_bit(rc_sq_wqe->byte_4,
-		     V2_RC_FRMR_WQE_BYTE_4_BIND_EN_S,
+	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_FRMR_WQE_BYTE_4_BIND_EN_S,
 		     wr->access & IB_ACCESS_MW_BIND ? 1 : 0);
-	roce_set_bit(rc_sq_wqe->byte_4,
-		     V2_RC_FRMR_WQE_BYTE_4_ATOMIC_S,
+	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_FRMR_WQE_BYTE_4_ATOMIC_S,
 		     wr->access & IB_ACCESS_REMOTE_ATOMIC ? 1 : 0);
-	roce_set_bit(rc_sq_wqe->byte_4,
-		     V2_RC_FRMR_WQE_BYTE_4_RR_S,
+	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_FRMR_WQE_BYTE_4_RR_S,
 		     wr->access & IB_ACCESS_REMOTE_READ ? 1 : 0);
-	roce_set_bit(rc_sq_wqe->byte_4,
-		     V2_RC_FRMR_WQE_BYTE_4_RW_S,
+	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_FRMR_WQE_BYTE_4_RW_S,
 		     wr->access & IB_ACCESS_REMOTE_WRITE ? 1 : 0);
-	roce_set_bit(rc_sq_wqe->byte_4,
-		     V2_RC_FRMR_WQE_BYTE_4_LW_S,
+	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_FRMR_WQE_BYTE_4_LW_S,
 		     wr->access & IB_ACCESS_LOCAL_WRITE ? 1 : 0);
 
 	/* Data structure reuse may lead to confusion */
-	rc_sq_wqe->msg_len = cpu_to_le32(mr->pbl_ba & 0xffffffff);
-	rc_sq_wqe->inv_key = cpu_to_le32(mr->pbl_ba >> 32);
+	pbl_ba = mr->pbl_mtr.hem_cfg.root_ba;
+	rc_sq_wqe->msg_len = cpu_to_le32(lower_32_bits(pbl_ba));
+	rc_sq_wqe->inv_key = cpu_to_le32(upper_32_bits(pbl_ba));
 
 	rc_sq_wqe->byte_16 = cpu_to_le32(wr->mr->length & 0xffffffff);
 	rc_sq_wqe->byte_20 = cpu_to_le32(wr->mr->length >> 32);
 	rc_sq_wqe->rkey = cpu_to_le32(wr->key);
 	rc_sq_wqe->va = cpu_to_le64(wr->mr->iova);
 
-	fseg->pbl_size = cpu_to_le32(mr->pbl_size);
+	fseg->pbl_size = cpu_to_le32(mr->npages);
 	roce_set_field(fseg->mode_buf_pg_sz,
 		       V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_M,
 		       V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_S,
-		       mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET);
+		       to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.buf_pg_shift));
 	roce_set_bit(fseg->mode_buf_pg_sz,
 		     V2_RC_FRMR_WQE_BYTE_40_BLK_MODE_S, 0);
 }
 
-static void set_atomic_seg(struct hns_roce_wqe_atomic_seg *aseg,
-			   const struct ib_atomic_wr *wr)
+static void set_atomic_seg(const struct ib_send_wr *wr,
+			   struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
+			   unsigned int valid_num_sge)
 {
-	if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
-		aseg->fetchadd_swap_data = cpu_to_le64(wr->swap);
-		aseg->cmp_data  = cpu_to_le64(wr->compare_add);
+	struct hns_roce_v2_wqe_data_seg *dseg =
+		(void *)rc_sq_wqe + sizeof(struct hns_roce_v2_rc_send_wqe);
+	struct hns_roce_wqe_atomic_seg *aseg =
+		(void *)dseg + sizeof(struct hns_roce_v2_wqe_data_seg);
+
+	set_data_seg_v2(dseg, wr->sg_list);
+
+	if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+		aseg->fetchadd_swap_data = cpu_to_le64(atomic_wr(wr)->swap);
+		aseg->cmp_data = cpu_to_le64(atomic_wr(wr)->compare_add);
 	} else {
-		aseg->fetchadd_swap_data = cpu_to_le64(wr->compare_add);
-		aseg->cmp_data  = 0;
+		aseg->fetchadd_swap_data =
+			cpu_to_le64(atomic_wr(wr)->compare_add);
+		aseg->cmp_data = 0;
 	}
+
+	roce_set_field(rc_sq_wqe->byte_16, V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M,
+		       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, valid_num_sge);
 }
 
-static void set_extend_sge(struct hns_roce_qp *qp, const struct ib_send_wr *wr,
-			   unsigned int *sge_ind, int valid_num_sge)
+static int fill_ext_sge_inl_data(struct hns_roce_qp *qp,
+				 const struct ib_send_wr *wr,
+				 unsigned int *sge_idx, u32 msg_len)
 {
-	struct hns_roce_v2_wqe_data_seg *dseg;
-	struct ib_sge *sg;
-	int num_in_wqe = 0;
-	int extend_sge_num;
-	int fi_sge_num;
-	int se_sge_num;
-	int shift;
-	int i;
+	struct ib_device *ibdev = &(to_hr_dev(qp->ibqp.device))->ib_dev;
+	unsigned int dseg_len = sizeof(struct hns_roce_v2_wqe_data_seg);
+	unsigned int ext_sge_sz = qp->sq.max_gs * dseg_len;
+	unsigned int left_len_in_pg;
+	unsigned int idx = *sge_idx;
+	unsigned int i = 0;
+	unsigned int len;
+	void *addr;
+	void *dseg;
 
-	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC)
-		num_in_wqe = HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE;
-	extend_sge_num = valid_num_sge - num_in_wqe;
-	sg = wr->sg_list + num_in_wqe;
-	shift = qp->hr_buf.page_shift;
+	if (msg_len > ext_sge_sz) {
+		ibdev_err(ibdev,
+			  "not enough extended sge space for inline data.\n");
+		return -EINVAL;
+	}
 
-	/*
-	 * Check whether wr->num_sge sges are in the same page. If not, we
-	 * should calculate how many sges in the first page and the second
-	 * page.
+	dseg = hns_roce_get_extend_sge(qp, idx & (qp->sge.sge_cnt - 1));
+	left_len_in_pg = hr_hw_page_align((uintptr_t)dseg) - (uintptr_t)dseg;
+	len = wr->sg_list[0].length;
+	addr = (void *)(unsigned long)(wr->sg_list[0].addr);
+
+	/* When copying data to extended sge space, the length left in the page
+	 * may not be long enough for the current user's sge, so the data should
+	 * be split into several parts: one in the first page, and the others in
+	 * the subsequent pages.
 	 */
-	dseg = get_send_extend_sge(qp, (*sge_ind) & (qp->sge.sge_cnt - 1));
-	fi_sge_num = (round_up((uintptr_t)dseg, 1 << shift) -
-		      (uintptr_t)dseg) /
-		      sizeof(struct hns_roce_v2_wqe_data_seg);
-	if (extend_sge_num > fi_sge_num) {
-		se_sge_num = extend_sge_num - fi_sge_num;
-		for (i = 0; i < fi_sge_num; i++) {
-			set_data_seg_v2(dseg++, sg + i);
-			(*sge_ind)++;
-		}
-		dseg = get_send_extend_sge(qp,
-					   (*sge_ind) & (qp->sge.sge_cnt - 1));
-		for (i = 0; i < se_sge_num; i++) {
-			set_data_seg_v2(dseg++, sg + fi_sge_num + i);
-			(*sge_ind)++;
-		}
-	} else {
-		for (i = 0; i < extend_sge_num; i++) {
-			set_data_seg_v2(dseg++, sg + i);
-			(*sge_ind)++;
+	while (1) {
+		if (len <= left_len_in_pg) {
+			memcpy(dseg, addr, len);
+
+			idx += len / dseg_len;
+
+			i++;
+			if (i >= wr->num_sge)
+				break;
+
+			left_len_in_pg -= len;
+			len = wr->sg_list[i].length;
+			addr = (void *)(unsigned long)(wr->sg_list[i].addr);
+			dseg += len;
+		} else {
+			memcpy(dseg, addr, left_len_in_pg);
+
+			len -= left_len_in_pg;
+			addr += left_len_in_pg;
+			idx += left_len_in_pg / dseg_len;
+			dseg = hns_roce_get_extend_sge(qp,
+						idx & (qp->sge.sge_cnt - 1));
+			left_len_in_pg = 1 << HNS_HW_PAGE_SHIFT;
 		}
 	}
+
+	*sge_idx = idx;
+
+	return 0;
+}
+
+static void set_extend_sge(struct hns_roce_qp *qp, struct ib_sge *sge,
+			   unsigned int *sge_ind, unsigned int cnt)
+{
+	struct hns_roce_v2_wqe_data_seg *dseg;
+	unsigned int idx = *sge_ind;
+
+	while (cnt > 0) {
+		dseg = hns_roce_get_extend_sge(qp, idx & (qp->sge.sge_cnt - 1));
+		if (likely(sge->length)) {
+			set_data_seg_v2(dseg, sge);
+			idx++;
+			cnt--;
+		}
+		sge++;
+	}
+
+	*sge_ind = idx;
+}
+
+static bool check_inl_data_len(struct hns_roce_qp *qp, unsigned int len)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(qp->ibqp.device);
+	int mtu = ib_mtu_enum_to_int(qp->path_mtu);
+
+	if (len > qp->max_inline_data || len > mtu) {
+		ibdev_err(&hr_dev->ib_dev,
+			  "invalid length of data, data len = %u, max inline len = %u, path mtu = %d.\n",
+			  len, qp->max_inline_data, mtu);
+		return false;
+	}
+
+	return true;
+}
+
+static int set_rc_inl(struct hns_roce_qp *qp, const struct ib_send_wr *wr,
+		      struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
+		      unsigned int *sge_idx)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(qp->ibqp.device);
+	u32 msg_len = le32_to_cpu(rc_sq_wqe->msg_len);
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	unsigned int curr_idx = *sge_idx;
+	void *dseg = rc_sq_wqe;
+	unsigned int i;
+	int ret;
+
+	if (unlikely(wr->opcode == IB_WR_RDMA_READ)) {
+		ibdev_err(ibdev, "invalid inline parameters!\n");
+		return -EINVAL;
+	}
+
+	if (!check_inl_data_len(qp, msg_len))
+		return -EINVAL;
+
+	dseg += sizeof(struct hns_roce_v2_rc_send_wqe);
+
+	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_INLINE_S, 1);
+
+	if (msg_len <= HNS_ROCE_V2_MAX_RC_INL_INN_SZ) {
+		roce_set_bit(rc_sq_wqe->byte_20,
+			     V2_RC_SEND_WQE_BYTE_20_INL_TYPE_S, 0);
+
+		for (i = 0; i < wr->num_sge; i++) {
+			memcpy(dseg, ((void *)wr->sg_list[i].addr),
+			       wr->sg_list[i].length);
+			dseg += wr->sg_list[i].length;
+		}
+	} else {
+		roce_set_bit(rc_sq_wqe->byte_20,
+			     V2_RC_SEND_WQE_BYTE_20_INL_TYPE_S, 1);
+
+		ret = fill_ext_sge_inl_data(qp, wr, &curr_idx, msg_len);
+		if (ret)
+			return ret;
+
+		roce_set_field(rc_sq_wqe->byte_16,
+			       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M,
+			       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S,
+			       curr_idx - *sge_idx);
+	}
+
+	*sge_idx = curr_idx;
+
+	return 0;
 }
 
 static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr,
 			     struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
-			     void *wqe, unsigned int *sge_ind,
-			     int valid_num_sge,
-			     const struct ib_send_wr **bad_wr)
+			     unsigned int *sge_ind,
+			     unsigned int valid_num_sge)
 {
-	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
-	struct hns_roce_v2_wqe_data_seg *dseg = wqe;
+	struct hns_roce_v2_wqe_data_seg *dseg =
+		(void *)rc_sq_wqe + sizeof(struct hns_roce_v2_rc_send_wqe);
 	struct hns_roce_qp *qp = to_hr_qp(ibqp);
 	int j = 0;
 	int i;
 
-	if (wr->send_flags & IB_SEND_INLINE && valid_num_sge) {
-		if (le32_to_cpu(rc_sq_wqe->msg_len) >
-		    hr_dev->caps.max_sq_inline) {
-			*bad_wr = wr;
-			dev_err(hr_dev->dev, "inline len(1-%d)=%d, illegal",
-				rc_sq_wqe->msg_len, hr_dev->caps.max_sq_inline);
-			return -EINVAL;
-		}
+	roce_set_field(rc_sq_wqe->byte_20,
+		       V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M,
+		       V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S,
+		       (*sge_ind) & (qp->sge.sge_cnt - 1));
 
-		if (wr->opcode == IB_WR_RDMA_READ) {
-			*bad_wr =  wr;
-			dev_err(hr_dev->dev, "Not support inline data!\n");
-			return -EINVAL;
-		}
+	if (wr->send_flags & IB_SEND_INLINE)
+		return set_rc_inl(qp, wr, rc_sq_wqe, sge_ind);
 
+	if (valid_num_sge <= HNS_ROCE_SGE_IN_WQE) {
 		for (i = 0; i < wr->num_sge; i++) {
-			memcpy(wqe, ((void *)wr->sg_list[i].addr),
-			       wr->sg_list[i].length);
-			wqe += wr->sg_list[i].length;
+			if (likely(wr->sg_list[i].length)) {
+				set_data_seg_v2(dseg, wr->sg_list + i);
+				dseg++;
+			}
 		}
-
-		roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_INLINE_S,
-			     1);
 	} else {
-		if (valid_num_sge <= HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) {
-			for (i = 0; i < wr->num_sge; i++) {
-				if (likely(wr->sg_list[i].length)) {
-					set_data_seg_v2(dseg, wr->sg_list + i);
-					dseg++;
-				}
+		for (i = 0; i < wr->num_sge && j < HNS_ROCE_SGE_IN_WQE; i++) {
+			if (likely(wr->sg_list[i].length)) {
+				set_data_seg_v2(dseg, wr->sg_list + i);
+				dseg++;
+				j++;
 			}
-		} else {
-			roce_set_field(rc_sq_wqe->byte_20,
-				     V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M,
-				     V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S,
-				     (*sge_ind) & (qp->sge.sge_cnt - 1));
-
-			for (i = 0; i < wr->num_sge &&
-			     j < HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE; i++) {
-				if (likely(wr->sg_list[i].length)) {
-					set_data_seg_v2(dseg, wr->sg_list + i);
-					dseg++;
-					j++;
-				}
-			}
-
-			set_extend_sge(qp, wr, sge_ind, valid_num_sge);
 		}
 
-		roce_set_field(rc_sq_wqe->byte_16,
-			       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M,
-			       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, valid_num_sge);
+		set_extend_sge(qp, wr->sg_list + i, sge_ind,
+			       valid_num_sge - HNS_ROCE_SGE_IN_WQE);
+	}
+
+	roce_set_field(rc_sq_wqe->byte_16,
+		       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M,
+		       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, valid_num_sge);
+
+	return 0;
+}
+
+static int check_send_valid(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_qp *hr_qp)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct ib_qp *ibqp = &hr_qp->ibqp;
+
+	if (unlikely(ibqp->qp_type != IB_QPT_RC &&
+		     ibqp->qp_type != IB_QPT_GSI &&
+		     ibqp->qp_type != IB_QPT_UD)) {
+		ibdev_err(ibdev, "Not supported QP(0x%x) type!\n",
+			  ibqp->qp_type);
+		return -EOPNOTSUPP;
+	} else if (unlikely(hr_qp->state == IB_QPS_RESET ||
+		   hr_qp->state == IB_QPS_INIT ||
+		   hr_qp->state == IB_QPS_RTR)) {
+		ibdev_err(ibdev, "failed to post WQE, QP state %hhu!\n",
+			  hr_qp->state);
+		return -EINVAL;
+	} else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)) {
+		ibdev_err(ibdev, "failed to post WQE, dev state %d!\n",
+			  hr_dev->state);
+		return -EIO;
 	}
 
 	return 0;
 }
 
-static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
-				 const struct ib_qp_attr *attr,
-				 int attr_mask, enum ib_qp_state cur_state,
-				 enum ib_qp_state new_state);
+static unsigned int calc_wr_sge_num(const struct ib_send_wr *wr,
+				    unsigned int *sge_len)
+{
+	unsigned int valid_num = 0;
+	unsigned int len = 0;
+	int i;
+
+	for (i = 0; i < wr->num_sge; i++) {
+		if (likely(wr->sg_list[i].length)) {
+			len += wr->sg_list[i].length;
+			valid_num++;
+		}
+	}
+
+	*sge_len = len;
+	return valid_num;
+}
+
+static __le32 get_immtdata(const struct ib_send_wr *wr)
+{
+	switch (wr->opcode) {
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		return cpu_to_le32(be32_to_cpu(wr->ex.imm_data));
+	default:
+		return 0;
+	}
+}
+
+static int set_ud_opcode(struct hns_roce_v2_ud_send_wqe *ud_sq_wqe,
+			 const struct ib_send_wr *wr)
+{
+	u32 ib_op = wr->opcode;
+
+	if (ib_op != IB_WR_SEND && ib_op != IB_WR_SEND_WITH_IMM)
+		return -EINVAL;
+
+	ud_sq_wqe->immtdata = get_immtdata(wr);
+
+	roce_set_field(ud_sq_wqe->byte_4, V2_UD_SEND_WQE_BYTE_4_OPCODE_M,
+		       V2_UD_SEND_WQE_BYTE_4_OPCODE_S, to_hr_opcode(ib_op));
+
+	return 0;
+}
+
+static inline int set_ud_wqe(struct hns_roce_qp *qp,
+			     const struct ib_send_wr *wr,
+			     void *wqe, unsigned int *sge_idx,
+			     unsigned int owner_bit)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(qp->ibqp.device);
+	struct hns_roce_ah *ah = to_hr_ah(ud_wr(wr)->ah);
+	struct hns_roce_v2_ud_send_wqe *ud_sq_wqe = wqe;
+	unsigned int curr_idx = *sge_idx;
+	int valid_num_sge;
+	u32 msg_len = 0;
+	int ret;
+
+	valid_num_sge = calc_wr_sge_num(wr, &msg_len);
+	memset(ud_sq_wqe, 0, sizeof(*ud_sq_wqe));
+
+	ret = set_ud_opcode(ud_sq_wqe, wr);
+	if (WARN_ON(ret))
+		return ret;
+
+	roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_0_M,
+		       V2_UD_SEND_WQE_DMAC_0_S, ah->av.mac[0]);
+	roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_1_M,
+		       V2_UD_SEND_WQE_DMAC_1_S, ah->av.mac[1]);
+	roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_2_M,
+		       V2_UD_SEND_WQE_DMAC_2_S, ah->av.mac[2]);
+	roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_3_M,
+		       V2_UD_SEND_WQE_DMAC_3_S, ah->av.mac[3]);
+	roce_set_field(ud_sq_wqe->byte_48, V2_UD_SEND_WQE_BYTE_48_DMAC_4_M,
+		       V2_UD_SEND_WQE_BYTE_48_DMAC_4_S, ah->av.mac[4]);
+	roce_set_field(ud_sq_wqe->byte_48, V2_UD_SEND_WQE_BYTE_48_DMAC_5_M,
+		       V2_UD_SEND_WQE_BYTE_48_DMAC_5_S, ah->av.mac[5]);
+
+	ud_sq_wqe->msg_len = cpu_to_le32(msg_len);
+
+	/* Set sig attr */
+	roce_set_bit(ud_sq_wqe->byte_4, V2_UD_SEND_WQE_BYTE_4_CQE_S,
+		     (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0);
+
+	/* Set se attr */
+	roce_set_bit(ud_sq_wqe->byte_4, V2_UD_SEND_WQE_BYTE_4_SE_S,
+		     (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0);
+
+	roce_set_bit(ud_sq_wqe->byte_4, V2_UD_SEND_WQE_BYTE_4_OWNER_S,
+		     owner_bit);
+
+	roce_set_field(ud_sq_wqe->byte_16, V2_UD_SEND_WQE_BYTE_16_PD_M,
+		       V2_UD_SEND_WQE_BYTE_16_PD_S, to_hr_pd(qp->ibqp.pd)->pdn);
+
+	roce_set_field(ud_sq_wqe->byte_16, V2_UD_SEND_WQE_BYTE_16_SGE_NUM_M,
+		       V2_UD_SEND_WQE_BYTE_16_SGE_NUM_S, valid_num_sge);
+
+	roce_set_field(ud_sq_wqe->byte_20,
+		       V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M,
+		       V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S,
+		       curr_idx & (qp->sge.sge_cnt - 1));
+
+	roce_set_field(ud_sq_wqe->byte_24, V2_UD_SEND_WQE_BYTE_24_UDPSPN_M,
+		       V2_UD_SEND_WQE_BYTE_24_UDPSPN_S, ah->av.udp_sport);
+	ud_sq_wqe->qkey = cpu_to_le32(ud_wr(wr)->remote_qkey & 0x80000000 ?
+			  qp->qkey : ud_wr(wr)->remote_qkey);
+	roce_set_field(ud_sq_wqe->byte_32, V2_UD_SEND_WQE_BYTE_32_DQPN_M,
+		       V2_UD_SEND_WQE_BYTE_32_DQPN_S, ud_wr(wr)->remote_qpn);
+
+	roce_set_field(ud_sq_wqe->byte_36, V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_M,
+		       V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_S, ah->av.hop_limit);
+	roce_set_field(ud_sq_wqe->byte_36, V2_UD_SEND_WQE_BYTE_36_TCLASS_M,
+		       V2_UD_SEND_WQE_BYTE_36_TCLASS_S, ah->av.tclass);
+	roce_set_field(ud_sq_wqe->byte_40, V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M,
+		       V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S, ah->av.flowlabel);
+	roce_set_field(ud_sq_wqe->byte_40, V2_UD_SEND_WQE_BYTE_40_SL_M,
+		       V2_UD_SEND_WQE_BYTE_40_SL_S, ah->av.sl);
+	roce_set_field(ud_sq_wqe->byte_40, V2_UD_SEND_WQE_BYTE_40_PORTN_M,
+		       V2_UD_SEND_WQE_BYTE_40_PORTN_S, qp->port);
+
+	roce_set_field(ud_sq_wqe->byte_48, V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M,
+		       V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S, ah->av.gid_index);
+
+	if (hr_dev->pci_dev->revision <= PCI_REVISION_ID_HIP08) {
+		roce_set_bit(ud_sq_wqe->byte_40,
+			     V2_UD_SEND_WQE_BYTE_40_UD_VLAN_EN_S,
+			     ah->av.vlan_en);
+		roce_set_field(ud_sq_wqe->byte_36,
+			       V2_UD_SEND_WQE_BYTE_36_VLAN_M,
+			       V2_UD_SEND_WQE_BYTE_36_VLAN_S, ah->av.vlan_id);
+	}
+
+	memcpy(&ud_sq_wqe->dgid[0], &ah->av.dgid[0], GID_LEN_V2);
+
+	set_extend_sge(qp, wr->sg_list, &curr_idx, valid_num_sge);
+
+	*sge_idx = curr_idx;
+
+	return 0;
+}
+
+static int set_rc_opcode(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe,
+			 const struct ib_send_wr *wr)
+{
+	u32 ib_op = wr->opcode;
+
+	rc_sq_wqe->immtdata = get_immtdata(wr);
+
+	switch (ib_op) {
+	case IB_WR_RDMA_READ:
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey);
+		rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr);
+		break;
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_IMM:
+		break;
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+		rc_sq_wqe->rkey = cpu_to_le32(atomic_wr(wr)->rkey);
+		rc_sq_wqe->va = cpu_to_le64(atomic_wr(wr)->remote_addr);
+		break;
+	case IB_WR_REG_MR:
+		set_frmr_seg(rc_sq_wqe, reg_wr(wr));
+		break;
+	case IB_WR_LOCAL_INV:
+		roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_SO_S, 1);
+		fallthrough;
+	case IB_WR_SEND_WITH_INV:
+		rc_sq_wqe->inv_key = cpu_to_le32(wr->ex.invalidate_rkey);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
+		       V2_RC_SEND_WQE_BYTE_4_OPCODE_S, to_hr_opcode(ib_op));
+
+	return 0;
+}
+static inline int set_rc_wqe(struct hns_roce_qp *qp,
+			     const struct ib_send_wr *wr,
+			     void *wqe, unsigned int *sge_idx,
+			     unsigned int owner_bit)
+{
+	struct hns_roce_v2_rc_send_wqe *rc_sq_wqe = wqe;
+	unsigned int curr_idx = *sge_idx;
+	unsigned int valid_num_sge;
+	u32 msg_len = 0;
+	int ret;
+
+	valid_num_sge = calc_wr_sge_num(wr, &msg_len);
+	memset(rc_sq_wqe, 0, sizeof(*rc_sq_wqe));
+
+	rc_sq_wqe->msg_len = cpu_to_le32(msg_len);
+
+	ret = set_rc_opcode(rc_sq_wqe, wr);
+	if (WARN_ON(ret))
+		return ret;
+
+	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_FENCE_S,
+		     (wr->send_flags & IB_SEND_FENCE) ? 1 : 0);
+
+	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_SE_S,
+		     (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0);
+
+	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_CQE_S,
+		     (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0);
+
+	roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OWNER_S,
+		     owner_bit);
+
+	if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+	    wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+		set_atomic_seg(wr, rc_sq_wqe, valid_num_sge);
+	else if (wr->opcode != IB_WR_REG_MR)
+		ret = set_rwqe_data_seg(&qp->ibqp, wr, rc_sq_wqe,
+					&curr_idx, valid_num_sge);
+
+	*sge_idx = curr_idx;
+
+	return ret;
+}
+
+static inline void update_sq_db(struct hns_roce_dev *hr_dev,
+				struct hns_roce_qp *qp)
+{
+	/*
+	 * Hip08 hardware cannot flush the WQEs in the SQ if the QP state
+	 * gets into errored mode. Hence, as a workaround to this hardware
+	 * limitation, the driver needs to assist in flushing. But the
+	 * flushing operation uses a mailbox to convey the QP state to the
+	 * hardware, and the mailbox calls can sleep due to the mutex
+	 * protection around them. Hence, use the deferred flush for
+	 * now.
+	 */
+	if (qp->state == IB_QPS_ERR) {
+		if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG, &qp->flush_flag))
+			init_flush_work(hr_dev, qp);
+	} else {
+		struct hns_roce_v2_db sq_db = {};
+
+		roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_TAG_M,
+			       V2_DB_BYTE_4_TAG_S, qp->doorbell_qpn);
+		roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_CMD_M,
+			       V2_DB_BYTE_4_CMD_S, HNS_ROCE_V2_SQ_DB);
+		roce_set_field(sq_db.parameter, V2_DB_PARAMETER_IDX_M,
+			       V2_DB_PARAMETER_IDX_S, qp->sq.head);
+		roce_set_field(sq_db.parameter, V2_DB_PARAMETER_SL_M,
+			       V2_DB_PARAMETER_SL_S, qp->sl);
+
+		hns_roce_write64(hr_dev, (__le32 *)&sq_db, qp->sq.db_reg_l);
+	}
+}
 
 static int hns_roce_v2_post_send(struct ib_qp *ibqp,
 				 const struct ib_send_wr *wr,
 				 const struct ib_send_wr **bad_wr)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
-	struct hns_roce_ah *ah = to_hr_ah(ud_wr(wr)->ah);
-	struct hns_roce_v2_ud_send_wqe *ud_sq_wqe;
-	struct hns_roce_v2_rc_send_wqe *rc_sq_wqe;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_qp *qp = to_hr_qp(ibqp);
-	struct hns_roce_wqe_frmr_seg *fseg;
-	struct device *dev = hr_dev->dev;
-	struct hns_roce_v2_db sq_db;
-	struct ib_qp_attr attr;
+	unsigned long flags = 0;
 	unsigned int owner_bit;
 	unsigned int sge_idx;
 	unsigned int wqe_idx;
-	unsigned long flags;
-	int valid_num_sge;
 	void *wqe = NULL;
-	bool loopback;
-	int attr_mask;
-	u32 tmp_len;
-	int ret = 0;
-	u32 hr_op;
-	u8 *smac;
 	int nreq;
-	int i;
-
-	if (unlikely(ibqp->qp_type != IB_QPT_RC &&
-		     ibqp->qp_type != IB_QPT_GSI &&
-		     ibqp->qp_type != IB_QPT_UD)) {
-		dev_err(dev, "Not supported QP(0x%x)type!\n", ibqp->qp_type);
-		*bad_wr = wr;
-		return -EOPNOTSUPP;
-	}
-
-	if (unlikely(qp->state == IB_QPS_RESET || qp->state == IB_QPS_INIT ||
-		     qp->state == IB_QPS_RTR)) {
-		dev_err(dev, "Post WQE fail, QP state %d err!\n", qp->state);
-		*bad_wr = wr;
-		return -EINVAL;
-	}
+	int ret;
 
 	spin_lock_irqsave(&qp->sq.lock, flags);
+
+	ret = check_send_valid(hr_dev, qp);
+	if (unlikely(ret)) {
+		*bad_wr = wr;
+		nreq = 0;
+		goto out;
+	}
+
 	sge_idx = qp->next_sge;
 
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
@@ -286,327 +666,37 @@
 		wqe_idx = (qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1);
 
 		if (unlikely(wr->num_sge > qp->sq.max_gs)) {
-			dev_err(dev, "num_sge=%d > qp->sq.max_gs=%d\n",
-				wr->num_sge, qp->sq.max_gs);
+			ibdev_err(ibdev, "num_sge = %d > qp->sq.max_gs = %u.\n",
+				  wr->num_sge, qp->sq.max_gs);
 			ret = -EINVAL;
 			*bad_wr = wr;
 			goto out;
 		}
 
-		wqe = get_send_wqe(qp, wqe_idx);
+		wqe = hns_roce_get_send_wqe(qp, wqe_idx);
 		qp->sq.wrid[wqe_idx] = wr->wr_id;
 		owner_bit =
 		       ~(((qp->sq.head + nreq) >> ilog2(qp->sq.wqe_cnt)) & 0x1);
-		valid_num_sge = 0;
-		tmp_len = 0;
-
-		for (i = 0; i < wr->num_sge; i++) {
-			if (likely(wr->sg_list[i].length)) {
-				tmp_len += wr->sg_list[i].length;
-				valid_num_sge++;
-			}
-		}
 
 		/* Corresponding to the QP type, wqe process separately */
-		if (ibqp->qp_type == IB_QPT_GSI) {
-			ud_sq_wqe = wqe;
-			memset(ud_sq_wqe, 0, sizeof(*ud_sq_wqe));
+		if (ibqp->qp_type == IB_QPT_GSI)
+			ret = set_ud_wqe(qp, wr, wqe, &sge_idx, owner_bit);
+		else if (ibqp->qp_type == IB_QPT_RC)
+			ret = set_rc_wqe(qp, wr, wqe, &sge_idx, owner_bit);
 
-			roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_0_M,
-				       V2_UD_SEND_WQE_DMAC_0_S, ah->av.mac[0]);
-			roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_1_M,
-				       V2_UD_SEND_WQE_DMAC_1_S, ah->av.mac[1]);
-			roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_2_M,
-				       V2_UD_SEND_WQE_DMAC_2_S, ah->av.mac[2]);
-			roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_3_M,
-				       V2_UD_SEND_WQE_DMAC_3_S, ah->av.mac[3]);
-			roce_set_field(ud_sq_wqe->byte_48,
-				       V2_UD_SEND_WQE_BYTE_48_DMAC_4_M,
-				       V2_UD_SEND_WQE_BYTE_48_DMAC_4_S,
-				       ah->av.mac[4]);
-			roce_set_field(ud_sq_wqe->byte_48,
-				       V2_UD_SEND_WQE_BYTE_48_DMAC_5_M,
-				       V2_UD_SEND_WQE_BYTE_48_DMAC_5_S,
-				       ah->av.mac[5]);
-
-			/* MAC loopback */
-			smac = (u8 *)hr_dev->dev_addr[qp->port];
-			loopback = ether_addr_equal_unaligned(ah->av.mac,
-							      smac) ? 1 : 0;
-
-			roce_set_bit(ud_sq_wqe->byte_40,
-				     V2_UD_SEND_WQE_BYTE_40_LBI_S, loopback);
-
-			roce_set_field(ud_sq_wqe->byte_4,
-				       V2_UD_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_UD_SEND_WQE_BYTE_4_OPCODE_S,
-				       HNS_ROCE_V2_WQE_OP_SEND);
-
-			ud_sq_wqe->msg_len =
-			 cpu_to_le32(le32_to_cpu(ud_sq_wqe->msg_len) + tmp_len);
-
-			switch (wr->opcode) {
-			case IB_WR_SEND_WITH_IMM:
-			case IB_WR_RDMA_WRITE_WITH_IMM:
-				ud_sq_wqe->immtdata =
-				      cpu_to_le32(be32_to_cpu(wr->ex.imm_data));
-				break;
-			default:
-				ud_sq_wqe->immtdata = 0;
-				break;
-			}
-
-			/* Set sig attr */
-			roce_set_bit(ud_sq_wqe->byte_4,
-				   V2_UD_SEND_WQE_BYTE_4_CQE_S,
-				   (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0);
-
-			/* Set se attr */
-			roce_set_bit(ud_sq_wqe->byte_4,
-				  V2_UD_SEND_WQE_BYTE_4_SE_S,
-				  (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0);
-
-			roce_set_bit(ud_sq_wqe->byte_4,
-				     V2_UD_SEND_WQE_BYTE_4_OWNER_S, owner_bit);
-
-			roce_set_field(ud_sq_wqe->byte_16,
-				       V2_UD_SEND_WQE_BYTE_16_PD_M,
-				       V2_UD_SEND_WQE_BYTE_16_PD_S,
-				       to_hr_pd(ibqp->pd)->pdn);
-
-			roce_set_field(ud_sq_wqe->byte_16,
-				       V2_UD_SEND_WQE_BYTE_16_SGE_NUM_M,
-				       V2_UD_SEND_WQE_BYTE_16_SGE_NUM_S,
-				       valid_num_sge);
-
-			roce_set_field(ud_sq_wqe->byte_20,
-				     V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M,
-				     V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S,
-				     sge_idx & (qp->sge.sge_cnt - 1));
-
-			roce_set_field(ud_sq_wqe->byte_24,
-				       V2_UD_SEND_WQE_BYTE_24_UDPSPN_M,
-				       V2_UD_SEND_WQE_BYTE_24_UDPSPN_S, 0);
-			ud_sq_wqe->qkey =
-			     cpu_to_le32(ud_wr(wr)->remote_qkey & 0x80000000 ?
-			     qp->qkey : ud_wr(wr)->remote_qkey);
-			roce_set_field(ud_sq_wqe->byte_32,
-				       V2_UD_SEND_WQE_BYTE_32_DQPN_M,
-				       V2_UD_SEND_WQE_BYTE_32_DQPN_S,
-				       ud_wr(wr)->remote_qpn);
-
-			roce_set_field(ud_sq_wqe->byte_36,
-				       V2_UD_SEND_WQE_BYTE_36_VLAN_M,
-				       V2_UD_SEND_WQE_BYTE_36_VLAN_S,
-				       ah->av.vlan);
-			roce_set_field(ud_sq_wqe->byte_36,
-				       V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_M,
-				       V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_S,
-				       ah->av.hop_limit);
-			roce_set_field(ud_sq_wqe->byte_36,
-				       V2_UD_SEND_WQE_BYTE_36_TCLASS_M,
-				       V2_UD_SEND_WQE_BYTE_36_TCLASS_S,
-				       ah->av.tclass);
-			roce_set_field(ud_sq_wqe->byte_40,
-				       V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M,
-				       V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S,
-				       ah->av.flowlabel);
-			roce_set_field(ud_sq_wqe->byte_40,
-				       V2_UD_SEND_WQE_BYTE_40_SL_M,
-				       V2_UD_SEND_WQE_BYTE_40_SL_S,
-				       ah->av.sl);
-			roce_set_field(ud_sq_wqe->byte_40,
-				       V2_UD_SEND_WQE_BYTE_40_PORTN_M,
-				       V2_UD_SEND_WQE_BYTE_40_PORTN_S,
-				       qp->port);
-
-			roce_set_bit(ud_sq_wqe->byte_40,
-				     V2_UD_SEND_WQE_BYTE_40_UD_VLAN_EN_S,
-				     ah->av.vlan_en ? 1 : 0);
-			roce_set_field(ud_sq_wqe->byte_48,
-				       V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M,
-				       V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S,
-				       hns_get_gid_index(hr_dev, qp->phy_port,
-							 ah->av.gid_index));
-
-			memcpy(&ud_sq_wqe->dgid[0], &ah->av.dgid[0],
-			       GID_LEN_V2);
-
-			set_extend_sge(qp, wr, &sge_idx, valid_num_sge);
-		} else if (ibqp->qp_type == IB_QPT_RC) {
-			rc_sq_wqe = wqe;
-			memset(rc_sq_wqe, 0, sizeof(*rc_sq_wqe));
-
-			rc_sq_wqe->msg_len =
-			 cpu_to_le32(le32_to_cpu(rc_sq_wqe->msg_len) + tmp_len);
-
-			switch (wr->opcode) {
-			case IB_WR_SEND_WITH_IMM:
-			case IB_WR_RDMA_WRITE_WITH_IMM:
-				rc_sq_wqe->immtdata =
-				      cpu_to_le32(be32_to_cpu(wr->ex.imm_data));
-				break;
-			case IB_WR_SEND_WITH_INV:
-				rc_sq_wqe->inv_key =
-					cpu_to_le32(wr->ex.invalidate_rkey);
-				break;
-			default:
-				rc_sq_wqe->immtdata = 0;
-				break;
-			}
-
-			roce_set_bit(rc_sq_wqe->byte_4,
-				     V2_RC_SEND_WQE_BYTE_4_FENCE_S,
-				     (wr->send_flags & IB_SEND_FENCE) ? 1 : 0);
-
-			roce_set_bit(rc_sq_wqe->byte_4,
-				  V2_RC_SEND_WQE_BYTE_4_SE_S,
-				  (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0);
-
-			roce_set_bit(rc_sq_wqe->byte_4,
-				   V2_RC_SEND_WQE_BYTE_4_CQE_S,
-				   (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0);
-
-			roce_set_bit(rc_sq_wqe->byte_4,
-				     V2_RC_SEND_WQE_BYTE_4_OWNER_S, owner_bit);
-
-			wqe += sizeof(struct hns_roce_v2_rc_send_wqe);
-			switch (wr->opcode) {
-			case IB_WR_RDMA_READ:
-				hr_op = HNS_ROCE_V2_WQE_OP_RDMA_READ;
-				rc_sq_wqe->rkey =
-					cpu_to_le32(rdma_wr(wr)->rkey);
-				rc_sq_wqe->va =
-					cpu_to_le64(rdma_wr(wr)->remote_addr);
-				break;
-			case IB_WR_RDMA_WRITE:
-				hr_op = HNS_ROCE_V2_WQE_OP_RDMA_WRITE;
-				rc_sq_wqe->rkey =
-					cpu_to_le32(rdma_wr(wr)->rkey);
-				rc_sq_wqe->va =
-					cpu_to_le64(rdma_wr(wr)->remote_addr);
-				break;
-			case IB_WR_RDMA_WRITE_WITH_IMM:
-				hr_op = HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM;
-				rc_sq_wqe->rkey =
-					cpu_to_le32(rdma_wr(wr)->rkey);
-				rc_sq_wqe->va =
-					cpu_to_le64(rdma_wr(wr)->remote_addr);
-				break;
-			case IB_WR_SEND:
-				hr_op = HNS_ROCE_V2_WQE_OP_SEND;
-				break;
-			case IB_WR_SEND_WITH_INV:
-				hr_op = HNS_ROCE_V2_WQE_OP_SEND_WITH_INV;
-				break;
-			case IB_WR_SEND_WITH_IMM:
-				hr_op = HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM;
-				break;
-			case IB_WR_LOCAL_INV:
-				hr_op = HNS_ROCE_V2_WQE_OP_LOCAL_INV;
-				roce_set_bit(rc_sq_wqe->byte_4,
-					       V2_RC_SEND_WQE_BYTE_4_SO_S, 1);
-				rc_sq_wqe->inv_key =
-					    cpu_to_le32(wr->ex.invalidate_rkey);
-				break;
-			case IB_WR_REG_MR:
-				hr_op = HNS_ROCE_V2_WQE_OP_FAST_REG_PMR;
-				fseg = wqe;
-				set_frmr_seg(rc_sq_wqe, fseg, reg_wr(wr));
-				break;
-			case IB_WR_ATOMIC_CMP_AND_SWP:
-				hr_op = HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP;
-				rc_sq_wqe->rkey =
-					cpu_to_le32(atomic_wr(wr)->rkey);
-				rc_sq_wqe->va =
-					cpu_to_le64(atomic_wr(wr)->remote_addr);
-				break;
-			case IB_WR_ATOMIC_FETCH_AND_ADD:
-				hr_op = HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD;
-				rc_sq_wqe->rkey =
-					cpu_to_le32(atomic_wr(wr)->rkey);
-				rc_sq_wqe->va =
-					cpu_to_le64(atomic_wr(wr)->remote_addr);
-				break;
-			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
-				hr_op =
-				       HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP;
-				break;
-			case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
-				hr_op =
-				      HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD;
-				break;
-			default:
-				hr_op = HNS_ROCE_V2_WQE_OP_MASK;
-				break;
-			}
-
-			roce_set_field(rc_sq_wqe->byte_4,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_M,
-				       V2_RC_SEND_WQE_BYTE_4_OPCODE_S, hr_op);
-
-			if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
-			    wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
-				struct hns_roce_v2_wqe_data_seg *dseg;
-
-				dseg = wqe;
-				set_data_seg_v2(dseg, wr->sg_list);
-				wqe += sizeof(struct hns_roce_v2_wqe_data_seg);
-				set_atomic_seg(wqe, atomic_wr(wr));
-				roce_set_field(rc_sq_wqe->byte_16,
-					       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M,
-					       V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S,
-					       valid_num_sge);
-			} else if (wr->opcode != IB_WR_REG_MR) {
-				ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe,
-							wqe, &sge_idx,
-							valid_num_sge, bad_wr);
-				if (ret)
-					goto out;
-			}
-		} else {
-			dev_err(dev, "Illegal qp_type(0x%x)\n", ibqp->qp_type);
-			spin_unlock_irqrestore(&qp->sq.lock, flags);
+		if (unlikely(ret)) {
 			*bad_wr = wr;
-			return -EOPNOTSUPP;
+			goto out;
 		}
 	}
 
 out:
 	if (likely(nreq)) {
 		qp->sq.head += nreq;
+		qp->next_sge = sge_idx;
 		/* Memory barrier */
 		wmb();
-
-		sq_db.byte_4 = 0;
-		sq_db.parameter = 0;
-
-		roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_TAG_M,
-			       V2_DB_BYTE_4_TAG_S, qp->doorbell_qpn);
-		roce_set_field(sq_db.byte_4, V2_DB_BYTE_4_CMD_M,
-			       V2_DB_BYTE_4_CMD_S, HNS_ROCE_V2_SQ_DB);
-		roce_set_field(sq_db.parameter, V2_DB_PARAMETER_IDX_M,
-			       V2_DB_PARAMETER_IDX_S,
-			       qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1));
-		roce_set_field(sq_db.parameter, V2_DB_PARAMETER_SL_M,
-			       V2_DB_PARAMETER_SL_S, qp->sl);
-
-		hns_roce_write64(hr_dev, (__le32 *)&sq_db, qp->sq.db_reg_l);
-
-		qp->next_sge = sge_idx;
-
-		if (qp->state == IB_QPS_ERR) {
-			attr_mask = IB_QP_STATE;
-			attr.qp_state = IB_QPS_ERR;
-
-			ret = hns_roce_v2_modify_qp(&qp->ibqp, &attr, attr_mask,
-						    qp->state, IB_QPS_ERR);
-			if (ret) {
-				spin_unlock_irqrestore(&qp->sq.lock, flags);
-				*bad_wr = wr;
-				return ret;
-			}
-		}
+		update_sq_db(hr_dev, qp);
 	}
 
 	spin_unlock_irqrestore(&qp->sq.lock, flags);
@@ -614,35 +704,45 @@
 	return ret;
 }
 
+static int check_recv_valid(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_qp *hr_qp)
+{
+	if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN))
+		return -EIO;
+	else if (hr_qp->state == IB_QPS_RESET)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
 				 const struct ib_recv_wr *wr,
 				 const struct ib_recv_wr **bad_wr)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_v2_wqe_data_seg *dseg;
 	struct hns_roce_rinl_sge *sge_list;
-	struct device *dev = hr_dev->dev;
-	struct ib_qp_attr attr;
 	unsigned long flags;
 	void *wqe = NULL;
-	int attr_mask;
 	u32 wqe_idx;
-	int ret = 0;
 	int nreq;
+	int ret;
 	int i;
 
 	spin_lock_irqsave(&hr_qp->rq.lock, flags);
 
-	if (hr_qp->state == IB_QPS_RESET) {
-		spin_unlock_irqrestore(&hr_qp->rq.lock, flags);
+	ret = check_recv_valid(hr_dev, hr_qp);
+	if (unlikely(ret)) {
 		*bad_wr = wr;
-		return -EINVAL;
+		nreq = 0;
+		goto out;
 	}
 
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
-		if (hns_roce_wq_overflow(&hr_qp->rq, nreq,
-			hr_qp->ibqp.recv_cq)) {
+		if (unlikely(hns_roce_wq_overflow(&hr_qp->rq, nreq,
+						  hr_qp->ibqp.recv_cq))) {
 			ret = -ENOMEM;
 			*bad_wr = wr;
 			goto out;
@@ -651,14 +751,14 @@
 		wqe_idx = (hr_qp->rq.head + nreq) & (hr_qp->rq.wqe_cnt - 1);
 
 		if (unlikely(wr->num_sge > hr_qp->rq.max_gs)) {
-			dev_err(dev, "rq:num_sge=%d > qp->sq.max_gs=%d\n",
-				wr->num_sge, hr_qp->rq.max_gs);
+			ibdev_err(ibdev, "num_sge = %d >= max_sge = %u.\n",
+				  wr->num_sge, hr_qp->rq.max_gs);
 			ret = -EINVAL;
 			*bad_wr = wr;
 			goto out;
 		}
 
-		wqe = get_recv_wqe(hr_qp, wqe_idx);
+		wqe = hns_roce_get_recv_wqe(hr_qp, wqe_idx);
 		dseg = (struct hns_roce_v2_wqe_data_seg *)wqe;
 		for (i = 0; i < wr->num_sge; i++) {
 			if (!wr->sg_list[i].length)
@@ -667,13 +767,13 @@
 			dseg++;
 		}
 
-		if (i < hr_qp->rq.max_gs) {
+		if (wr->num_sge < hr_qp->rq.max_gs) {
 			dseg->lkey = cpu_to_le32(HNS_ROCE_INVALID_LKEY);
 			dseg->addr = 0;
 		}
 
 		/* rq support inline data */
-		if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) {
+		if (hr_qp->rq_inl_buf.wqe_cnt) {
 			sge_list = hr_qp->rq_inl_buf.wqe_list[wqe_idx].sg_list;
 			hr_qp->rq_inl_buf.wqe_list[wqe_idx].sge_cnt =
 							       (u32)wr->num_sge;
@@ -693,20 +793,21 @@
 		/* Memory barrier */
 		wmb();
 
-		*hr_qp->rdb.db_record = hr_qp->rq.head & 0xffff;
-
+		/*
+		 * Hip08 hardware cannot flush the WQEs in the RQ if the QP state
+		 * gets into errored mode. Hence, as a workaround to this hardware
+		 * limitation, the driver needs to assist in flushing. But the
+		 * flushing operation uses a mailbox to convey the QP state to the
+		 * hardware, and the mailbox calls can sleep due to the mutex
+		 * protection around them. Hence, use the deferred flush for
+		 * now.
+		 */
 		if (hr_qp->state == IB_QPS_ERR) {
-			attr_mask = IB_QP_STATE;
-			attr.qp_state = IB_QPS_ERR;
-
-			ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, &attr,
-						    attr_mask, hr_qp->state,
-						    IB_QPS_ERR);
-			if (ret) {
-				spin_unlock_irqrestore(&hr_qp->rq.lock, flags);
-				*bad_wr = wr;
-				return ret;
-			}
+			if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG,
+					      &hr_qp->flush_flag))
+				init_flush_work(hr_dev, hr_qp);
+		} else {
+			*hr_qp->rdb.db_record = hr_qp->rq.head & 0xffff;
 		}
 	}
 	spin_unlock_irqrestore(&hr_qp->rq.lock, flags);
@@ -714,6 +815,129 @@
 	return ret;
 }
 
+static void *get_srq_wqe(struct hns_roce_srq *srq, int n)
+{
+	return hns_roce_buf_offset(srq->buf_mtr.kmem, n << srq->wqe_shift);
+}
+
+static void *get_idx_buf(struct hns_roce_idx_que *idx_que, int n)
+{
+	return hns_roce_buf_offset(idx_que->mtr.kmem,
+				   n << idx_que->entry_shift);
+}
+
+static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, int wqe_index)
+{
+	/* always called with interrupts disabled. */
+	spin_lock(&srq->lock);
+
+	bitmap_clear(srq->idx_que.bitmap, wqe_index, 1);
+	srq->tail++;
+
+	spin_unlock(&srq->lock);
+}
+
+static int find_empty_entry(struct hns_roce_idx_que *idx_que,
+			    unsigned long size)
+{
+	int wqe_idx;
+
+	if (unlikely(bitmap_full(idx_que->bitmap, size)))
+		return -ENOSPC;
+
+	wqe_idx = find_first_zero_bit(idx_que->bitmap, size);
+
+	bitmap_set(idx_que->bitmap, wqe_idx, 1);
+
+	return wqe_idx;
+}
+
+static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
+				     const struct ib_recv_wr *wr,
+				     const struct ib_recv_wr **bad_wr)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
+	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+	struct hns_roce_v2_wqe_data_seg *dseg;
+	struct hns_roce_v2_db srq_db;
+	unsigned long flags;
+	__le32 *srq_idx;
+	int ret = 0;
+	int wqe_idx;
+	void *wqe;
+	int nreq;
+	int ind;
+	int i;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	ind = srq->head & (srq->wqe_cnt - 1);
+
+	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		if (unlikely(wr->num_sge >= srq->max_gs)) {
+			ret = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+
+		if (unlikely(srq->head == srq->tail)) {
+			ret = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
+		wqe_idx = find_empty_entry(&srq->idx_que, srq->wqe_cnt);
+		if (unlikely(wqe_idx < 0)) {
+			ret = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
+		wqe = get_srq_wqe(srq, wqe_idx);
+		dseg = (struct hns_roce_v2_wqe_data_seg *)wqe;
+
+		for (i = 0; i < wr->num_sge; ++i) {
+			dseg[i].len = cpu_to_le32(wr->sg_list[i].length);
+			dseg[i].lkey = cpu_to_le32(wr->sg_list[i].lkey);
+			dseg[i].addr = cpu_to_le64(wr->sg_list[i].addr);
+		}
+
+		if (wr->num_sge < srq->max_gs) {
+			dseg[i].len = 0;
+			dseg[i].lkey = cpu_to_le32(0x100);
+			dseg[i].addr = 0;
+		}
+
+		srq_idx = get_idx_buf(&srq->idx_que, ind);
+		*srq_idx = cpu_to_le32(wqe_idx);
+
+		srq->wrid[wqe_idx] = wr->wr_id;
+		ind = (ind + 1) & (srq->wqe_cnt - 1);
+	}
+
+	if (likely(nreq)) {
+		srq->head += nreq;
+
+		/*
+		 * Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		srq_db.byte_4 =
+			cpu_to_le32(HNS_ROCE_V2_SRQ_DB << V2_DB_BYTE_4_CMD_S |
+				    (srq->srqn & V2_DB_BYTE_4_TAG_M));
+		srq_db.parameter =
+			cpu_to_le32(srq->head & V2_DB_PARAMETER_IDX_M);
+
+		hns_roce_write64(hr_dev, (__le32 *)&srq_db, srq->db_reg_l);
+	}
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	return ret;
+}
+
 static int hns_roce_v2_cmd_hw_reseted(struct hns_roce_dev *hr_dev,
 				      unsigned long instance_stage,
 				      unsigned long reset_stage)
@@ -741,9 +965,14 @@
 					unsigned long instance_stage,
 					unsigned long reset_stage)
 {
-	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+#define HW_RESET_TIMEOUT_US 1000000
+#define HW_RESET_SLEEP_US 1000
+
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	struct hnae3_handle *handle = priv->handle;
 	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+	unsigned long val;
+	int ret;
 
 	/* When hardware reset is detected, we should stop sending mailbox&cmq&
 	 * doorbell to hardware. If now in .init_instance() function, we should
@@ -755,7 +984,11 @@
 	 * again.
 	 */
 	hr_dev->dis_db = true;
-	if (!ops->get_hw_reset_stat(handle))
+
+	ret = read_poll_timeout(ops->ae_dev_reset_cnt, val,
+				val > hr_dev->reset_cnt, HW_RESET_SLEEP_US,
+				HW_RESET_TIMEOUT_US, false, handle);
+	if (!ret)
 		hr_dev->is_reset = true;
 
 	if (!hr_dev->is_reset || reset_stage == HNS_ROCE_STATE_RST_INIT ||
@@ -767,7 +1000,7 @@
 
 static int hns_roce_v2_cmd_sw_resetting(struct hns_roce_dev *hr_dev)
 {
-	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	struct hnae3_handle *handle = priv->handle;
 	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
 
@@ -784,7 +1017,7 @@
 
 static int hns_roce_v2_rst_process_cmd(struct hns_roce_dev *hr_dev)
 {
-	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	struct hnae3_handle *handle = priv->handle;
 	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
 	unsigned long instance_stage;	/* the current instance stage */
@@ -806,7 +1039,7 @@
 	instance_stage = handle->rinfo.instance_state;
 	reset_stage = handle->rinfo.reset_state;
 	reset_cnt = ops->ae_dev_reset_cnt(handle);
-	hw_resetting = ops->get_hw_reset_stat(handle);
+	hw_resetting = ops->get_cmdq_stat(handle);
 	sw_resetting = ops->ae_dev_resetting(handle);
 
 	if (reset_cnt != hr_dev->reset_cnt)
@@ -864,7 +1097,7 @@
 
 static int hns_roce_init_cmq_ring(struct hns_roce_dev *hr_dev, bool ring_type)
 {
-	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	struct hns_roce_v2_cmq_ring *ring = (ring_type == TYPE_CSQ) ?
 					    &priv->cmq.csq : &priv->cmq.crq;
 
@@ -877,7 +1110,7 @@
 
 static void hns_roce_cmq_init_regs(struct hns_roce_dev *hr_dev, bool ring_type)
 {
-	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	struct hns_roce_v2_cmq_ring *ring = (ring_type == TYPE_CSQ) ?
 					    &priv->cmq.csq : &priv->cmq.crq;
 	dma_addr_t dma = ring->desc_dma_addr;
@@ -903,7 +1136,7 @@
 
 static int hns_roce_v2_cmq_init(struct hns_roce_dev *hr_dev)
 {
-	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	int ret;
 
 	/* Setup the queue entries for command queue */
@@ -947,7 +1180,7 @@
 
 static void hns_roce_v2_cmq_exit(struct hns_roce_dev *hr_dev)
 {
-	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 
 	hns_roce_free_cmq_desc(hr_dev, &priv->cmq.csq);
 	hns_roce_free_cmq_desc(hr_dev, &priv->cmq.crq);
@@ -969,15 +1202,15 @@
 
 static int hns_roce_cmq_csq_done(struct hns_roce_dev *hr_dev)
 {
-	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
 	u32 head = roce_read(hr_dev, ROCEE_TX_CMQ_HEAD_REG);
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 
 	return head == priv->cmq.csq.next_to_use;
 }
 
 static int hns_roce_cmq_csq_clean(struct hns_roce_dev *hr_dev)
 {
-	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq;
 	struct hns_roce_cmq_desc *desc;
 	u16 ntc = csq->next_to_clean;
@@ -1002,7 +1235,7 @@
 static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
 			       struct hns_roce_cmq_desc *desc, int num)
 {
-	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	struct hns_roce_v2_cmq_ring *csq = &priv->cmq.csq;
 	struct hns_roce_cmq_desc *desc_to_use;
 	bool complete = false;
@@ -1129,7 +1362,7 @@
 
 static bool hns_roce_func_clr_chk_rst(struct hns_roce_dev *hr_dev)
 {
-	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	struct hnae3_handle *handle = priv->handle;
 	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
 	unsigned long reset_cnt;
@@ -1149,7 +1382,7 @@
 static void hns_roce_func_clr_rst_prc(struct hns_roce_dev *hr_dev, int retval,
 				      int flag)
 {
-	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	struct hnae3_handle *handle = priv->handle;
 	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
 	unsigned long instance_stage;
@@ -1256,7 +1489,6 @@
 	}
 
 out:
-	dev_err(hr_dev->dev, "Func clear fail.\n");
 	hns_roce_func_clr_rst_prc(hr_dev, ret, fclr_write_fail_flag);
 }
 
@@ -1372,8 +1604,7 @@
 	return 0;
 }
 
-static int hns_roce_set_vf_switch_param(struct hns_roce_dev *hr_dev,
-						  int vf_id)
+static int hns_roce_set_vf_switch_param(struct hns_roce_dev *hr_dev, int vf_id)
 {
 	struct hns_roce_cmq_desc desc;
 	struct hns_roce_vf_switch *swt;
@@ -1382,13 +1613,12 @@
 	swt = (struct hns_roce_vf_switch *)desc.data;
 	hns_roce_cmq_setup_basic_desc(&desc, HNS_SWITCH_PARAMETER_CFG, true);
 	swt->rocee_sel |= cpu_to_le32(HNS_ICL_SWITCH_CMD_ROCEE_SEL);
-	roce_set_field(swt->fun_id,
-			VF_SWITCH_DATA_FUN_ID_VF_ID_M,
-			VF_SWITCH_DATA_FUN_ID_VF_ID_S,
-			vf_id);
+	roce_set_field(swt->fun_id, VF_SWITCH_DATA_FUN_ID_VF_ID_M,
+		       VF_SWITCH_DATA_FUN_ID_VF_ID_S, vf_id);
 	ret = hns_roce_cmq_send(hr_dev, &desc, 1);
 	if (ret)
 		return ret;
+
 	desc.flag =
 		cpu_to_le16(HNS_ROCE_CMD_FLAG_NO_INTR | HNS_ROCE_CMD_FLAG_IN);
 	desc.flag &= cpu_to_le16(~HNS_ROCE_CMD_FLAG_WR);
@@ -1408,8 +1638,6 @@
 
 	req_a = (struct hns_roce_vf_res_a *)desc[0].data;
 	req_b = (struct hns_roce_vf_res_b *)desc[1].data;
-	memset(req_a, 0, sizeof(*req_a));
-	memset(req_b, 0, sizeof(*req_b));
 	for (i = 0; i < 2; i++) {
 		hns_roce_cmq_setup_basic_desc(&desc[i],
 					      HNS_ROCE_OPC_ALLOC_VF_RES, false);
@@ -1418,82 +1646,63 @@
 			desc[i].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
 		else
 			desc[i].flag &= ~cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
-
-		if (i == 0) {
-			roce_set_field(req_a->vf_qpc_bt_idx_num,
-				       VF_RES_A_DATA_1_VF_QPC_BT_IDX_M,
-				       VF_RES_A_DATA_1_VF_QPC_BT_IDX_S, 0);
-			roce_set_field(req_a->vf_qpc_bt_idx_num,
-				       VF_RES_A_DATA_1_VF_QPC_BT_NUM_M,
-				       VF_RES_A_DATA_1_VF_QPC_BT_NUM_S,
-				       HNS_ROCE_VF_QPC_BT_NUM);
-
-			roce_set_field(req_a->vf_srqc_bt_idx_num,
-				       VF_RES_A_DATA_2_VF_SRQC_BT_IDX_M,
-				       VF_RES_A_DATA_2_VF_SRQC_BT_IDX_S, 0);
-			roce_set_field(req_a->vf_srqc_bt_idx_num,
-				       VF_RES_A_DATA_2_VF_SRQC_BT_NUM_M,
-				       VF_RES_A_DATA_2_VF_SRQC_BT_NUM_S,
-				       HNS_ROCE_VF_SRQC_BT_NUM);
-
-			roce_set_field(req_a->vf_cqc_bt_idx_num,
-				       VF_RES_A_DATA_3_VF_CQC_BT_IDX_M,
-				       VF_RES_A_DATA_3_VF_CQC_BT_IDX_S, 0);
-			roce_set_field(req_a->vf_cqc_bt_idx_num,
-				       VF_RES_A_DATA_3_VF_CQC_BT_NUM_M,
-				       VF_RES_A_DATA_3_VF_CQC_BT_NUM_S,
-				       HNS_ROCE_VF_CQC_BT_NUM);
-
-			roce_set_field(req_a->vf_mpt_bt_idx_num,
-				       VF_RES_A_DATA_4_VF_MPT_BT_IDX_M,
-				       VF_RES_A_DATA_4_VF_MPT_BT_IDX_S, 0);
-			roce_set_field(req_a->vf_mpt_bt_idx_num,
-				       VF_RES_A_DATA_4_VF_MPT_BT_NUM_M,
-				       VF_RES_A_DATA_4_VF_MPT_BT_NUM_S,
-				       HNS_ROCE_VF_MPT_BT_NUM);
-
-			roce_set_field(req_a->vf_eqc_bt_idx_num,
-				       VF_RES_A_DATA_5_VF_EQC_IDX_M,
-				       VF_RES_A_DATA_5_VF_EQC_IDX_S, 0);
-			roce_set_field(req_a->vf_eqc_bt_idx_num,
-				       VF_RES_A_DATA_5_VF_EQC_NUM_M,
-				       VF_RES_A_DATA_5_VF_EQC_NUM_S,
-				       HNS_ROCE_VF_EQC_NUM);
-		} else {
-			roce_set_field(req_b->vf_smac_idx_num,
-				       VF_RES_B_DATA_1_VF_SMAC_IDX_M,
-				       VF_RES_B_DATA_1_VF_SMAC_IDX_S, 0);
-			roce_set_field(req_b->vf_smac_idx_num,
-				       VF_RES_B_DATA_1_VF_SMAC_NUM_M,
-				       VF_RES_B_DATA_1_VF_SMAC_NUM_S,
-				       HNS_ROCE_VF_SMAC_NUM);
-
-			roce_set_field(req_b->vf_sgid_idx_num,
-				       VF_RES_B_DATA_2_VF_SGID_IDX_M,
-				       VF_RES_B_DATA_2_VF_SGID_IDX_S, 0);
-			roce_set_field(req_b->vf_sgid_idx_num,
-				       VF_RES_B_DATA_2_VF_SGID_NUM_M,
-				       VF_RES_B_DATA_2_VF_SGID_NUM_S,
-				       HNS_ROCE_VF_SGID_NUM);
-
-			roce_set_field(req_b->vf_qid_idx_sl_num,
-				       VF_RES_B_DATA_3_VF_QID_IDX_M,
-				       VF_RES_B_DATA_3_VF_QID_IDX_S, 0);
-			roce_set_field(req_b->vf_qid_idx_sl_num,
-				       VF_RES_B_DATA_3_VF_SL_NUM_M,
-				       VF_RES_B_DATA_3_VF_SL_NUM_S,
-				       HNS_ROCE_VF_SL_NUM);
-
-			roce_set_field(req_b->vf_sccc_idx_num,
-				       VF_RES_B_DATA_4_VF_SCCC_BT_IDX_M,
-				       VF_RES_B_DATA_4_VF_SCCC_BT_IDX_S, 0);
-			roce_set_field(req_b->vf_sccc_idx_num,
-				       VF_RES_B_DATA_4_VF_SCCC_BT_NUM_M,
-				       VF_RES_B_DATA_4_VF_SCCC_BT_NUM_S,
-				       HNS_ROCE_VF_SCCC_BT_NUM);
-		}
 	}
 
+	roce_set_field(req_a->vf_qpc_bt_idx_num,
+		       VF_RES_A_DATA_1_VF_QPC_BT_IDX_M,
+		       VF_RES_A_DATA_1_VF_QPC_BT_IDX_S, 0);
+	roce_set_field(req_a->vf_qpc_bt_idx_num,
+		       VF_RES_A_DATA_1_VF_QPC_BT_NUM_M,
+		       VF_RES_A_DATA_1_VF_QPC_BT_NUM_S, HNS_ROCE_VF_QPC_BT_NUM);
+
+	roce_set_field(req_a->vf_srqc_bt_idx_num,
+		       VF_RES_A_DATA_2_VF_SRQC_BT_IDX_M,
+		       VF_RES_A_DATA_2_VF_SRQC_BT_IDX_S, 0);
+	roce_set_field(req_a->vf_srqc_bt_idx_num,
+		       VF_RES_A_DATA_2_VF_SRQC_BT_NUM_M,
+		       VF_RES_A_DATA_2_VF_SRQC_BT_NUM_S,
+		       HNS_ROCE_VF_SRQC_BT_NUM);
+
+	roce_set_field(req_a->vf_cqc_bt_idx_num,
+		       VF_RES_A_DATA_3_VF_CQC_BT_IDX_M,
+		       VF_RES_A_DATA_3_VF_CQC_BT_IDX_S, 0);
+	roce_set_field(req_a->vf_cqc_bt_idx_num,
+		       VF_RES_A_DATA_3_VF_CQC_BT_NUM_M,
+		       VF_RES_A_DATA_3_VF_CQC_BT_NUM_S, HNS_ROCE_VF_CQC_BT_NUM);
+
+	roce_set_field(req_a->vf_mpt_bt_idx_num,
+		       VF_RES_A_DATA_4_VF_MPT_BT_IDX_M,
+		       VF_RES_A_DATA_4_VF_MPT_BT_IDX_S, 0);
+	roce_set_field(req_a->vf_mpt_bt_idx_num,
+		       VF_RES_A_DATA_4_VF_MPT_BT_NUM_M,
+		       VF_RES_A_DATA_4_VF_MPT_BT_NUM_S, HNS_ROCE_VF_MPT_BT_NUM);
+
+	roce_set_field(req_a->vf_eqc_bt_idx_num, VF_RES_A_DATA_5_VF_EQC_IDX_M,
+		       VF_RES_A_DATA_5_VF_EQC_IDX_S, 0);
+	roce_set_field(req_a->vf_eqc_bt_idx_num, VF_RES_A_DATA_5_VF_EQC_NUM_M,
+		       VF_RES_A_DATA_5_VF_EQC_NUM_S, HNS_ROCE_VF_EQC_NUM);
+
+	roce_set_field(req_b->vf_smac_idx_num, VF_RES_B_DATA_1_VF_SMAC_IDX_M,
+		       VF_RES_B_DATA_1_VF_SMAC_IDX_S, 0);
+	roce_set_field(req_b->vf_smac_idx_num, VF_RES_B_DATA_1_VF_SMAC_NUM_M,
+		       VF_RES_B_DATA_1_VF_SMAC_NUM_S, HNS_ROCE_VF_SMAC_NUM);
+
+	roce_set_field(req_b->vf_sgid_idx_num, VF_RES_B_DATA_2_VF_SGID_IDX_M,
+		       VF_RES_B_DATA_2_VF_SGID_IDX_S, 0);
+	roce_set_field(req_b->vf_sgid_idx_num, VF_RES_B_DATA_2_VF_SGID_NUM_M,
+		       VF_RES_B_DATA_2_VF_SGID_NUM_S, HNS_ROCE_VF_SGID_NUM);
+
+	roce_set_field(req_b->vf_qid_idx_sl_num, VF_RES_B_DATA_3_VF_QID_IDX_M,
+		       VF_RES_B_DATA_3_VF_QID_IDX_S, 0);
+	roce_set_field(req_b->vf_qid_idx_sl_num, VF_RES_B_DATA_3_VF_SL_NUM_M,
+		       VF_RES_B_DATA_3_VF_SL_NUM_S, HNS_ROCE_VF_SL_NUM);
+
+	roce_set_field(req_b->vf_sccc_idx_num, VF_RES_B_DATA_4_VF_SCCC_BT_IDX_M,
+		       VF_RES_B_DATA_4_VF_SCCC_BT_IDX_S, 0);
+	roce_set_field(req_b->vf_sccc_idx_num, VF_RES_B_DATA_4_VF_SCCC_BT_NUM_M,
+		       VF_RES_B_DATA_4_VF_SCCC_BT_NUM_S,
+		       HNS_ROCE_VF_SCCC_BT_NUM);
+
 	return hns_roce_cmq_send(hr_dev, desc, 2);
 }
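
Both descriptors above are filled in purely with roce_set_field(), which writes a value into the bit field described by a mask and a shift inside a 32-bit word of the command payload. A small standalone model of that mask/shift idiom follows; set_field(), get_field() and the example mask are stand-ins, not the driver's macros.

#include <stdint.h>
#include <stdio.h>

/* Write val into the field selected by mask/shift, preserving other bits. */
static void set_field(uint32_t *reg, uint32_t mask, uint32_t shift, uint32_t val)
{
	*reg = (*reg & ~mask) | ((val << shift) & mask);
}

/* Read the field back out. */
static uint32_t get_field(uint32_t reg, uint32_t mask, uint32_t shift)
{
	return (reg & mask) >> shift;
}

int main(void)
{
	uint32_t word = 0;

	set_field(&word, 0x0000ff00, 8, 0x2a);	/* e.g. a *_BT_NUM-style field */
	printf("field = 0x%x\n", get_field(word, 0x0000ff00, 8));	/* 0x2a */
	return 0;
}
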
 
@@ -1568,6 +1777,440 @@
 	return hns_roce_cmq_send(hr_dev, &desc, 1);
 }
 
+static void set_default_caps(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_caps *caps = &hr_dev->caps;
+
+	caps->num_qps		= HNS_ROCE_V2_MAX_QP_NUM;
+	caps->max_wqes		= HNS_ROCE_V2_MAX_WQE_NUM;
+	caps->num_cqs		= HNS_ROCE_V2_MAX_CQ_NUM;
+	caps->num_srqs		= HNS_ROCE_V2_MAX_SRQ_NUM;
+	caps->min_cqes		= HNS_ROCE_MIN_CQE_NUM;
+	caps->max_cqes		= HNS_ROCE_V2_MAX_CQE_NUM;
+	caps->max_sq_sg		= HNS_ROCE_V2_MAX_SQ_SGE_NUM;
+	caps->max_extend_sg	= HNS_ROCE_V2_MAX_EXTEND_SGE_NUM;
+	caps->max_rq_sg		= HNS_ROCE_V2_MAX_RQ_SGE_NUM;
+	caps->max_sq_inline	= HNS_ROCE_V2_MAX_SQ_INLINE;
+	caps->num_uars		= HNS_ROCE_V2_UAR_NUM;
+	caps->phy_num_uars	= HNS_ROCE_V2_PHY_UAR_NUM;
+	caps->num_aeq_vectors	= HNS_ROCE_V2_AEQE_VEC_NUM;
+	caps->num_comp_vectors	= HNS_ROCE_V2_COMP_VEC_NUM;
+	caps->num_other_vectors = HNS_ROCE_V2_ABNORMAL_VEC_NUM;
+	caps->num_mtpts		= HNS_ROCE_V2_MAX_MTPT_NUM;
+	caps->num_mtt_segs	= HNS_ROCE_V2_MAX_MTT_SEGS;
+	caps->num_cqe_segs	= HNS_ROCE_V2_MAX_CQE_SEGS;
+	caps->num_srqwqe_segs	= HNS_ROCE_V2_MAX_SRQWQE_SEGS;
+	caps->num_idx_segs	= HNS_ROCE_V2_MAX_IDX_SEGS;
+	caps->num_pds		= HNS_ROCE_V2_MAX_PD_NUM;
+	caps->max_qp_init_rdma	= HNS_ROCE_V2_MAX_QP_INIT_RDMA;
+	caps->max_qp_dest_rdma	= HNS_ROCE_V2_MAX_QP_DEST_RDMA;
+	caps->max_sq_desc_sz	= HNS_ROCE_V2_MAX_SQ_DESC_SZ;
+	caps->max_rq_desc_sz	= HNS_ROCE_V2_MAX_RQ_DESC_SZ;
+	caps->max_srq_desc_sz	= HNS_ROCE_V2_MAX_SRQ_DESC_SZ;
+	caps->qpc_sz		= HNS_ROCE_V2_QPC_SZ;
+	caps->irrl_entry_sz	= HNS_ROCE_V2_IRRL_ENTRY_SZ;
+	caps->trrl_entry_sz	= HNS_ROCE_V2_EXT_ATOMIC_TRRL_ENTRY_SZ;
+	caps->cqc_entry_sz	= HNS_ROCE_V2_CQC_ENTRY_SZ;
+	caps->srqc_entry_sz	= HNS_ROCE_V2_SRQC_ENTRY_SZ;
+	caps->mtpt_entry_sz	= HNS_ROCE_V2_MTPT_ENTRY_SZ;
+	caps->mtt_entry_sz	= HNS_ROCE_V2_MTT_ENTRY_SZ;
+	caps->idx_entry_sz	= HNS_ROCE_V2_IDX_ENTRY_SZ;
+	caps->cqe_sz		= HNS_ROCE_V2_CQE_SIZE;
+	caps->page_size_cap	= HNS_ROCE_V2_PAGE_SIZE_SUPPORTED;
+	caps->reserved_lkey	= 0;
+	caps->reserved_pds	= 0;
+	caps->reserved_mrws	= 1;
+	caps->reserved_uars	= 0;
+	caps->reserved_cqs	= 0;
+	caps->reserved_srqs	= 0;
+	caps->reserved_qps	= HNS_ROCE_V2_RSV_QPS;
+
+	caps->qpc_ba_pg_sz	= 0;
+	caps->qpc_buf_pg_sz	= 0;
+	caps->qpc_hop_num	= HNS_ROCE_CONTEXT_HOP_NUM;
+	caps->srqc_ba_pg_sz	= 0;
+	caps->srqc_buf_pg_sz	= 0;
+	caps->srqc_hop_num	= HNS_ROCE_CONTEXT_HOP_NUM;
+	caps->cqc_ba_pg_sz	= 0;
+	caps->cqc_buf_pg_sz	= 0;
+	caps->cqc_hop_num	= HNS_ROCE_CONTEXT_HOP_NUM;
+	caps->mpt_ba_pg_sz	= 0;
+	caps->mpt_buf_pg_sz	= 0;
+	caps->mpt_hop_num	= HNS_ROCE_CONTEXT_HOP_NUM;
+	caps->mtt_ba_pg_sz	= 0;
+	caps->mtt_buf_pg_sz	= 0;
+	caps->mtt_hop_num	= HNS_ROCE_MTT_HOP_NUM;
+	caps->wqe_sq_hop_num	= HNS_ROCE_SQWQE_HOP_NUM;
+	caps->wqe_sge_hop_num	= HNS_ROCE_EXT_SGE_HOP_NUM;
+	caps->wqe_rq_hop_num	= HNS_ROCE_RQWQE_HOP_NUM;
+	caps->cqe_ba_pg_sz	= HNS_ROCE_BA_PG_SZ_SUPPORTED_256K;
+	caps->cqe_buf_pg_sz	= 0;
+	caps->cqe_hop_num	= HNS_ROCE_CQE_HOP_NUM;
+	caps->srqwqe_ba_pg_sz	= 0;
+	caps->srqwqe_buf_pg_sz	= 0;
+	caps->srqwqe_hop_num	= HNS_ROCE_SRQWQE_HOP_NUM;
+	caps->idx_ba_pg_sz	= 0;
+	caps->idx_buf_pg_sz	= 0;
+	caps->idx_hop_num	= HNS_ROCE_IDX_HOP_NUM;
+	caps->chunk_sz		= HNS_ROCE_V2_TABLE_CHUNK_SIZE;
+
+	caps->flags		= HNS_ROCE_CAP_FLAG_REREG_MR |
+				  HNS_ROCE_CAP_FLAG_ROCE_V1_V2 |
+				  HNS_ROCE_CAP_FLAG_RECORD_DB |
+				  HNS_ROCE_CAP_FLAG_SQ_RECORD_DB;
+
+	caps->pkey_table_len[0] = 1;
+	caps->gid_table_len[0]	= HNS_ROCE_V2_GID_INDEX_NUM;
+	caps->ceqe_depth	= HNS_ROCE_V2_COMP_EQE_NUM;
+	caps->aeqe_depth	= HNS_ROCE_V2_ASYNC_EQE_NUM;
+	caps->aeqe_size		= HNS_ROCE_AEQE_SIZE;
+	caps->ceqe_size		= HNS_ROCE_CEQE_SIZE;
+	caps->local_ca_ack_delay = 0;
+	caps->max_mtu = IB_MTU_4096;
+
+	caps->max_srq_wrs	= HNS_ROCE_V2_MAX_SRQ_WR;
+	caps->max_srq_sges	= HNS_ROCE_V2_MAX_SRQ_SGE;
+
+	caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC | HNS_ROCE_CAP_FLAG_MW |
+		       HNS_ROCE_CAP_FLAG_SRQ | HNS_ROCE_CAP_FLAG_FRMR |
+		       HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL;
+
+	caps->num_qpc_timer	  = HNS_ROCE_V2_MAX_QPC_TIMER_NUM;
+	caps->qpc_timer_entry_sz  = HNS_ROCE_V2_QPC_TIMER_ENTRY_SZ;
+	caps->qpc_timer_ba_pg_sz  = 0;
+	caps->qpc_timer_buf_pg_sz = 0;
+	caps->qpc_timer_hop_num   = HNS_ROCE_HOP_NUM_0;
+	caps->num_cqc_timer	  = HNS_ROCE_V2_MAX_CQC_TIMER_NUM;
+	caps->cqc_timer_entry_sz  = HNS_ROCE_V2_CQC_TIMER_ENTRY_SZ;
+	caps->cqc_timer_ba_pg_sz  = 0;
+	caps->cqc_timer_buf_pg_sz = 0;
+	caps->cqc_timer_hop_num   = HNS_ROCE_HOP_NUM_0;
+
+	caps->sccc_sz = HNS_ROCE_V2_SCCC_SZ;
+	caps->sccc_ba_pg_sz	  = 0;
+	caps->sccc_buf_pg_sz	  = 0;
+	caps->sccc_hop_num	  = HNS_ROCE_SCCC_HOP_NUM;
+
+	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) {
+		caps->aeqe_size = HNS_ROCE_V3_EQE_SIZE;
+		caps->ceqe_size = HNS_ROCE_V3_EQE_SIZE;
+		caps->cqe_sz = HNS_ROCE_V3_CQE_SIZE;
+		caps->qpc_sz = HNS_ROCE_V3_QPC_SZ;
+	}
+}
+
+static void calc_pg_sz(int obj_num, int obj_size, int hop_num, int ctx_bt_num,
+		       int *buf_page_size, int *bt_page_size, u32 hem_type)
+{
+	u64 obj_per_chunk;
+	u64 bt_chunk_size = PAGE_SIZE;
+	u64 buf_chunk_size = PAGE_SIZE;
+	u64 obj_per_chunk_default = buf_chunk_size / obj_size;
+
+	*buf_page_size = 0;
+	*bt_page_size = 0;
+
+	switch (hop_num) {
+	case 3:
+		obj_per_chunk = ctx_bt_num * (bt_chunk_size / BA_BYTE_LEN) *
+				(bt_chunk_size / BA_BYTE_LEN) *
+				(bt_chunk_size / BA_BYTE_LEN) *
+				 obj_per_chunk_default;
+		break;
+	case 2:
+		obj_per_chunk = ctx_bt_num * (bt_chunk_size / BA_BYTE_LEN) *
+				(bt_chunk_size / BA_BYTE_LEN) *
+				 obj_per_chunk_default;
+		break;
+	case 1:
+		obj_per_chunk = ctx_bt_num * (bt_chunk_size / BA_BYTE_LEN) *
+				obj_per_chunk_default;
+		break;
+	case HNS_ROCE_HOP_NUM_0:
+		obj_per_chunk = ctx_bt_num * obj_per_chunk_default;
+		break;
+	default:
+		pr_err("table %u not support hop_num = %u!\n", hem_type,
+		       hop_num);
+		return;
+	}
+
+	if (hem_type >= HEM_TYPE_MTT)
+		*bt_page_size = ilog2(DIV_ROUND_UP(obj_num, obj_per_chunk));
+	else
+		*buf_page_size = ilog2(DIV_ROUND_UP(obj_num, obj_per_chunk));
+}
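
calc_pg_sz() above works out how far the default page size falls short: each extra hop multiplies the capacity of one context BT entry by the number of base-address entries per page, and the value written back is roughly ilog2 of how many default-sized hierarchies would still be needed, i.e. the extra page-size shift to apply so everything fits. The userspace model below covers only the hop_num == 1 case; PAGE_SIZE_SKETCH, BA_BYTES and the example numbers are assumptions for illustration, not values taken from the hardware.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_SKETCH 4096ULL
#define BA_BYTES 8ULL	/* assumed size of one base-address entry */

static uint64_t div_round_up(uint64_t a, uint64_t b)
{
	return (a + b - 1) / b;
}

static unsigned int ilog2_u64(uint64_t v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

/* hop_num == 1: one level of BA pages sits above the object buffers. */
static unsigned int extra_pg_shift(uint64_t obj_num, uint64_t obj_size,
				   uint64_t bt_num)
{
	uint64_t obj_per_chunk = bt_num * (PAGE_SIZE_SKETCH / BA_BYTES) *
				 (PAGE_SIZE_SKETCH / obj_size);

	return ilog2_u64(div_round_up(obj_num, obj_per_chunk));
}

int main(void)
{
	/* e.g. 4M contexts of 512 bytes under 256 BT entries -> shift of 2 */
	printf("extra page shift = %u\n",
	       extra_pg_shift(1ULL << 22, 512, 256));
	return 0;
}
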
+
+static int hns_roce_query_pf_caps(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_cmq_desc desc[HNS_ROCE_QUERY_PF_CAPS_CMD_NUM];
+	struct hns_roce_caps *caps = &hr_dev->caps;
+	struct hns_roce_query_pf_caps_a *resp_a;
+	struct hns_roce_query_pf_caps_b *resp_b;
+	struct hns_roce_query_pf_caps_c *resp_c;
+	struct hns_roce_query_pf_caps_d *resp_d;
+	struct hns_roce_query_pf_caps_e *resp_e;
+	int ctx_hop_num;
+	int pbl_hop_num;
+	int ret;
+	int i;
+
+	for (i = 0; i < HNS_ROCE_QUERY_PF_CAPS_CMD_NUM; i++) {
+		hns_roce_cmq_setup_basic_desc(&desc[i],
+					      HNS_ROCE_OPC_QUERY_PF_CAPS_NUM,
+					      true);
+		if (i < (HNS_ROCE_QUERY_PF_CAPS_CMD_NUM - 1))
+			desc[i].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
+		else
+			desc[i].flag &= ~cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
+	}
+
+	ret = hns_roce_cmq_send(hr_dev, desc, HNS_ROCE_QUERY_PF_CAPS_CMD_NUM);
+	if (ret)
+		return ret;
+
+	resp_a = (struct hns_roce_query_pf_caps_a *)desc[0].data;
+	resp_b = (struct hns_roce_query_pf_caps_b *)desc[1].data;
+	resp_c = (struct hns_roce_query_pf_caps_c *)desc[2].data;
+	resp_d = (struct hns_roce_query_pf_caps_d *)desc[3].data;
+	resp_e = (struct hns_roce_query_pf_caps_e *)desc[4].data;
+
+	caps->local_ca_ack_delay     = resp_a->local_ca_ack_delay;
+	caps->max_sq_sg		     = le16_to_cpu(resp_a->max_sq_sg);
+	caps->max_sq_inline	     = le16_to_cpu(resp_a->max_sq_inline);
+	caps->max_rq_sg		     = le16_to_cpu(resp_a->max_rq_sg);
+	caps->max_extend_sg	     = le32_to_cpu(resp_a->max_extend_sg);
+	caps->num_qpc_timer	     = le16_to_cpu(resp_a->num_qpc_timer);
+	caps->num_cqc_timer	     = le16_to_cpu(resp_a->num_cqc_timer);
+	caps->max_srq_sges	     = le16_to_cpu(resp_a->max_srq_sges);
+	caps->num_aeq_vectors	     = resp_a->num_aeq_vectors;
+	caps->num_other_vectors	     = resp_a->num_other_vectors;
+	caps->max_sq_desc_sz	     = resp_a->max_sq_desc_sz;
+	caps->max_rq_desc_sz	     = resp_a->max_rq_desc_sz;
+	caps->max_srq_desc_sz	     = resp_a->max_srq_desc_sz;
+	caps->cqe_sz		     = HNS_ROCE_V2_CQE_SIZE;
+
+	caps->mtpt_entry_sz	     = resp_b->mtpt_entry_sz;
+	caps->irrl_entry_sz	     = resp_b->irrl_entry_sz;
+	caps->trrl_entry_sz	     = resp_b->trrl_entry_sz;
+	caps->cqc_entry_sz	     = resp_b->cqc_entry_sz;
+	caps->srqc_entry_sz	     = resp_b->srqc_entry_sz;
+	caps->idx_entry_sz	     = resp_b->idx_entry_sz;
+	caps->sccc_sz		     = resp_b->sccc_sz;
+	caps->max_mtu		     = resp_b->max_mtu;
+	caps->qpc_sz		     = HNS_ROCE_V2_QPC_SZ;
+	caps->min_cqes		     = resp_b->min_cqes;
+	caps->min_wqes		     = resp_b->min_wqes;
+	caps->page_size_cap	     = le32_to_cpu(resp_b->page_size_cap);
+	caps->pkey_table_len[0]	     = resp_b->pkey_table_len;
+	caps->phy_num_uars	     = resp_b->phy_num_uars;
+	ctx_hop_num		     = resp_b->ctx_hop_num;
+	pbl_hop_num		     = resp_b->pbl_hop_num;
+
+	caps->num_pds = 1 << roce_get_field(resp_c->cap_flags_num_pds,
+					    V2_QUERY_PF_CAPS_C_NUM_PDS_M,
+					    V2_QUERY_PF_CAPS_C_NUM_PDS_S);
+	caps->flags = roce_get_field(resp_c->cap_flags_num_pds,
+				     V2_QUERY_PF_CAPS_C_CAP_FLAGS_M,
+				     V2_QUERY_PF_CAPS_C_CAP_FLAGS_S);
+	caps->flags |= le16_to_cpu(resp_d->cap_flags_ex) <<
+		       HNS_ROCE_CAP_FLAGS_EX_SHIFT;
+
+	caps->num_cqs = 1 << roce_get_field(resp_c->max_gid_num_cqs,
+					    V2_QUERY_PF_CAPS_C_NUM_CQS_M,
+					    V2_QUERY_PF_CAPS_C_NUM_CQS_S);
+	caps->gid_table_len[0] = roce_get_field(resp_c->max_gid_num_cqs,
+						V2_QUERY_PF_CAPS_C_MAX_GID_M,
+						V2_QUERY_PF_CAPS_C_MAX_GID_S);
+	caps->max_cqes = 1 << roce_get_field(resp_c->cq_depth,
+					     V2_QUERY_PF_CAPS_C_CQ_DEPTH_M,
+					     V2_QUERY_PF_CAPS_C_CQ_DEPTH_S);
+	caps->num_mtpts = 1 << roce_get_field(resp_c->num_mrws,
+					      V2_QUERY_PF_CAPS_C_NUM_MRWS_M,
+					      V2_QUERY_PF_CAPS_C_NUM_MRWS_S);
+	caps->num_qps = 1 << roce_get_field(resp_c->ord_num_qps,
+					    V2_QUERY_PF_CAPS_C_NUM_QPS_M,
+					    V2_QUERY_PF_CAPS_C_NUM_QPS_S);
+	caps->max_qp_init_rdma = roce_get_field(resp_c->ord_num_qps,
+						V2_QUERY_PF_CAPS_C_MAX_ORD_M,
+						V2_QUERY_PF_CAPS_C_MAX_ORD_S);
+	caps->max_qp_dest_rdma = caps->max_qp_init_rdma;
+	caps->max_wqes = 1 << le16_to_cpu(resp_c->sq_depth);
+	caps->num_srqs = 1 << roce_get_field(resp_d->wq_hop_num_max_srqs,
+					     V2_QUERY_PF_CAPS_D_NUM_SRQS_M,
+					     V2_QUERY_PF_CAPS_D_NUM_SRQS_S);
+	caps->max_srq_wrs = 1 << le16_to_cpu(resp_d->srq_depth);
+	caps->ceqe_depth = 1 << roce_get_field(resp_d->num_ceqs_ceq_depth,
+					       V2_QUERY_PF_CAPS_D_CEQ_DEPTH_M,
+					       V2_QUERY_PF_CAPS_D_CEQ_DEPTH_S);
+	caps->num_comp_vectors = roce_get_field(resp_d->num_ceqs_ceq_depth,
+						V2_QUERY_PF_CAPS_D_NUM_CEQS_M,
+						V2_QUERY_PF_CAPS_D_NUM_CEQS_S);
+	caps->aeqe_depth = 1 << roce_get_field(resp_d->arm_st_aeq_depth,
+					       V2_QUERY_PF_CAPS_D_AEQ_DEPTH_M,
+					       V2_QUERY_PF_CAPS_D_AEQ_DEPTH_S);
+	caps->default_aeq_arm_st = roce_get_field(resp_d->arm_st_aeq_depth,
+					    V2_QUERY_PF_CAPS_D_AEQ_ARM_ST_M,
+					    V2_QUERY_PF_CAPS_D_AEQ_ARM_ST_S);
+	caps->default_ceq_arm_st = roce_get_field(resp_d->arm_st_aeq_depth,
+					    V2_QUERY_PF_CAPS_D_CEQ_ARM_ST_M,
+					    V2_QUERY_PF_CAPS_D_CEQ_ARM_ST_S);
+	caps->reserved_pds = roce_get_field(resp_d->num_uars_rsv_pds,
+					    V2_QUERY_PF_CAPS_D_RSV_PDS_M,
+					    V2_QUERY_PF_CAPS_D_RSV_PDS_S);
+	caps->num_uars = 1 << roce_get_field(resp_d->num_uars_rsv_pds,
+					     V2_QUERY_PF_CAPS_D_NUM_UARS_M,
+					     V2_QUERY_PF_CAPS_D_NUM_UARS_S);
+	caps->reserved_qps = roce_get_field(resp_d->rsv_uars_rsv_qps,
+					    V2_QUERY_PF_CAPS_D_RSV_QPS_M,
+					    V2_QUERY_PF_CAPS_D_RSV_QPS_S);
+	caps->reserved_uars = roce_get_field(resp_d->rsv_uars_rsv_qps,
+					     V2_QUERY_PF_CAPS_D_RSV_UARS_M,
+					     V2_QUERY_PF_CAPS_D_RSV_UARS_S);
+	caps->reserved_mrws = roce_get_field(resp_e->chunk_size_shift_rsv_mrws,
+					     V2_QUERY_PF_CAPS_E_RSV_MRWS_M,
+					     V2_QUERY_PF_CAPS_E_RSV_MRWS_S);
+	caps->chunk_sz = 1 << roce_get_field(resp_e->chunk_size_shift_rsv_mrws,
+					 V2_QUERY_PF_CAPS_E_CHUNK_SIZE_SHIFT_M,
+					 V2_QUERY_PF_CAPS_E_CHUNK_SIZE_SHIFT_S);
+	caps->reserved_cqs = roce_get_field(resp_e->rsv_cqs,
+					    V2_QUERY_PF_CAPS_E_RSV_CQS_M,
+					    V2_QUERY_PF_CAPS_E_RSV_CQS_S);
+	caps->reserved_srqs = roce_get_field(resp_e->rsv_srqs,
+					     V2_QUERY_PF_CAPS_E_RSV_SRQS_M,
+					     V2_QUERY_PF_CAPS_E_RSV_SRQS_S);
+	caps->reserved_lkey = roce_get_field(resp_e->rsv_lkey,
+					     V2_QUERY_PF_CAPS_E_RSV_LKEYS_M,
+					     V2_QUERY_PF_CAPS_E_RSV_LKEYS_S);
+	caps->default_ceq_max_cnt = le16_to_cpu(resp_e->ceq_max_cnt);
+	caps->default_ceq_period = le16_to_cpu(resp_e->ceq_period);
+	caps->default_aeq_max_cnt = le16_to_cpu(resp_e->aeq_max_cnt);
+	caps->default_aeq_period = le16_to_cpu(resp_e->aeq_period);
+
+	caps->qpc_timer_entry_sz = HNS_ROCE_V2_QPC_TIMER_ENTRY_SZ;
+	caps->cqc_timer_entry_sz = HNS_ROCE_V2_CQC_TIMER_ENTRY_SZ;
+	caps->mtt_entry_sz = HNS_ROCE_V2_MTT_ENTRY_SZ;
+	caps->num_mtt_segs = HNS_ROCE_V2_MAX_MTT_SEGS;
+	caps->ceqe_size = HNS_ROCE_CEQE_SIZE;
+	caps->aeqe_size = HNS_ROCE_AEQE_SIZE;
+	caps->mtt_ba_pg_sz = 0;
+	caps->num_cqe_segs = HNS_ROCE_V2_MAX_CQE_SEGS;
+	caps->num_srqwqe_segs = HNS_ROCE_V2_MAX_SRQWQE_SEGS;
+	caps->num_idx_segs = HNS_ROCE_V2_MAX_IDX_SEGS;
+
+	caps->qpc_hop_num = ctx_hop_num;
+	caps->srqc_hop_num = ctx_hop_num;
+	caps->cqc_hop_num = ctx_hop_num;
+	caps->mpt_hop_num = ctx_hop_num;
+	caps->mtt_hop_num = pbl_hop_num;
+	caps->cqe_hop_num = pbl_hop_num;
+	caps->srqwqe_hop_num = pbl_hop_num;
+	caps->idx_hop_num = pbl_hop_num;
+	caps->wqe_sq_hop_num = roce_get_field(resp_d->wq_hop_num_max_srqs,
+					  V2_QUERY_PF_CAPS_D_SQWQE_HOP_NUM_M,
+					  V2_QUERY_PF_CAPS_D_SQWQE_HOP_NUM_S);
+	caps->wqe_sge_hop_num = roce_get_field(resp_d->wq_hop_num_max_srqs,
+					  V2_QUERY_PF_CAPS_D_EX_SGE_HOP_NUM_M,
+					  V2_QUERY_PF_CAPS_D_EX_SGE_HOP_NUM_S);
+	caps->wqe_rq_hop_num = roce_get_field(resp_d->wq_hop_num_max_srqs,
+					  V2_QUERY_PF_CAPS_D_RQWQE_HOP_NUM_M,
+					  V2_QUERY_PF_CAPS_D_RQWQE_HOP_NUM_S);
+
+	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) {
+		caps->ceqe_size = HNS_ROCE_V3_EQE_SIZE;
+		caps->aeqe_size = HNS_ROCE_V3_EQE_SIZE;
+		caps->cqe_sz = HNS_ROCE_V3_CQE_SIZE;
+		caps->qpc_sz = HNS_ROCE_V3_QPC_SZ;
+		caps->sccc_sz = HNS_ROCE_V3_SCCC_SZ;
+	}
+
+	calc_pg_sz(caps->num_qps, caps->qpc_sz, caps->qpc_hop_num,
+		   caps->qpc_bt_num, &caps->qpc_buf_pg_sz, &caps->qpc_ba_pg_sz,
+		   HEM_TYPE_QPC);
+	calc_pg_sz(caps->num_mtpts, caps->mtpt_entry_sz, caps->mpt_hop_num,
+		   caps->mpt_bt_num, &caps->mpt_buf_pg_sz, &caps->mpt_ba_pg_sz,
+		   HEM_TYPE_MTPT);
+	calc_pg_sz(caps->num_cqs, caps->cqc_entry_sz, caps->cqc_hop_num,
+		   caps->cqc_bt_num, &caps->cqc_buf_pg_sz, &caps->cqc_ba_pg_sz,
+		   HEM_TYPE_CQC);
+	calc_pg_sz(caps->num_srqs, caps->srqc_entry_sz, caps->srqc_hop_num,
+		   caps->srqc_bt_num, &caps->srqc_buf_pg_sz,
+		   &caps->srqc_ba_pg_sz, HEM_TYPE_SRQC);
+
+	caps->sccc_hop_num = ctx_hop_num;
+	caps->qpc_timer_hop_num = HNS_ROCE_HOP_NUM_0;
+	caps->cqc_timer_hop_num = HNS_ROCE_HOP_NUM_0;
+
+	calc_pg_sz(caps->num_qps, caps->sccc_sz,
+		   caps->sccc_hop_num, caps->sccc_bt_num,
+		   &caps->sccc_buf_pg_sz, &caps->sccc_ba_pg_sz,
+		   HEM_TYPE_SCCC);
+	calc_pg_sz(caps->num_cqc_timer, caps->cqc_timer_entry_sz,
+		   caps->cqc_timer_hop_num, caps->cqc_timer_bt_num,
+		   &caps->cqc_timer_buf_pg_sz,
+		   &caps->cqc_timer_ba_pg_sz, HEM_TYPE_CQC_TIMER);
+
+	calc_pg_sz(caps->num_cqe_segs, caps->mtt_entry_sz, caps->cqe_hop_num,
+		   1, &caps->cqe_buf_pg_sz, &caps->cqe_ba_pg_sz, HEM_TYPE_CQE);
+	calc_pg_sz(caps->num_srqwqe_segs, caps->mtt_entry_sz,
+		   caps->srqwqe_hop_num, 1, &caps->srqwqe_buf_pg_sz,
+		   &caps->srqwqe_ba_pg_sz, HEM_TYPE_SRQWQE);
+	calc_pg_sz(caps->num_idx_segs, caps->idx_entry_sz, caps->idx_hop_num,
+		   1, &caps->idx_buf_pg_sz, &caps->idx_ba_pg_sz, HEM_TYPE_IDX);
+
+	return 0;
+}
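
Both hns_roce_alloc_vf_resource() and hns_roce_query_pf_caps() above build one command out of several descriptors by setting a NEXT flag on every descriptor except the last, marking that another descriptor of the same command follows. A toy model of that chaining loop is below; struct desc_model and FLAG_NEXT are invented for the sketch and do not match the real descriptor layout.

#include <stdint.h>
#include <stdio.h>

#define FLAG_NEXT 0x0004u	/* assumed "another descriptor follows" bit */

struct desc_model {
	uint16_t flag;
	uint32_t data[6];
};

/* Mark every descriptor except the last as continued, as the driver loop does. */
static void chain_descs(struct desc_model *desc, int num)
{
	int i;

	for (i = 0; i < num; i++) {
		desc[i].flag = 0;
		if (i < num - 1)
			desc[i].flag |= FLAG_NEXT;
		else
			desc[i].flag &= ~FLAG_NEXT;
	}
}

int main(void)
{
	struct desc_model desc[5];
	int i;

	chain_descs(desc, 5);
	for (i = 0; i < 5; i++)
		printf("desc[%d].flag = 0x%x\n", i, desc[i].flag);
	return 0;
}
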
+
+static int hns_roce_config_qpc_size(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_cmq_desc desc;
+	struct hns_roce_cfg_entry_size *cfg_size =
+				  (struct hns_roce_cfg_entry_size *)desc.data;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_ENTRY_SIZE,
+				      false);
+
+	cfg_size->type = cpu_to_le32(HNS_ROCE_CFG_QPC_SIZE);
+	cfg_size->size = cpu_to_le32(hr_dev->caps.qpc_sz);
+
+	return hns_roce_cmq_send(hr_dev, &desc, 1);
+}
+
+static int hns_roce_config_sccc_size(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_cmq_desc desc;
+	struct hns_roce_cfg_entry_size *cfg_size =
+				  (struct hns_roce_cfg_entry_size *)desc.data;
+
+	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_ENTRY_SIZE,
+				      false);
+
+	cfg_size->type = cpu_to_le32(HNS_ROCE_CFG_SCCC_SIZE);
+	cfg_size->size = cpu_to_le32(hr_dev->caps.sccc_sz);
+
+	return hns_roce_cmq_send(hr_dev, &desc, 1);
+}
+
+static int hns_roce_config_entry_size(struct hns_roce_dev *hr_dev)
+{
+	int ret;
+
+	if (hr_dev->pci_dev->revision < PCI_REVISION_ID_HIP09)
+		return 0;
+
+	ret = hns_roce_config_qpc_size(hr_dev);
+	if (ret) {
+		dev_err(hr_dev->dev, "failed to cfg qpc sz, ret = %d.\n", ret);
+		return ret;
+	}
+
+	ret = hns_roce_config_sccc_size(hr_dev);
+	if (ret)
+		dev_err(hr_dev->dev, "failed to cfg sccc sz, ret = %d.\n", ret);
+
+	return ret;
+}
+
 static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
 {
 	struct hns_roce_caps *caps = &hr_dev->caps;
@@ -1602,16 +2245,36 @@
 		return ret;
 	}
 
-	if (hr_dev->pci_dev->revision == 0x21) {
-		ret = hns_roce_query_pf_timer_resource(hr_dev);
-		if (ret) {
-			dev_err(hr_dev->dev,
-				"Query pf timer resource fail, ret = %d.\n",
-				ret);
-			return ret;
-		}
+	ret = hns_roce_query_pf_timer_resource(hr_dev);
+	if (ret) {
+		dev_err(hr_dev->dev,
+			"failed to query pf timer resource, ret = %d.\n", ret);
+		return ret;
 	}
 
+	ret = hns_roce_set_vf_switch_param(hr_dev, 0);
+	if (ret) {
+		dev_err(hr_dev->dev,
+			"failed to set function switch param, ret = %d.\n",
+			ret);
+		return ret;
+	}
+
+	hr_dev->vendor_part_id = hr_dev->pci_dev->device;
+	hr_dev->sys_image_guid = be64_to_cpu(hr_dev->ib_dev.node_guid);
+
+	caps->pbl_ba_pg_sz	= HNS_ROCE_BA_PG_SZ_SUPPORTED_16K;
+	caps->pbl_buf_pg_sz	= 0;
+	caps->pbl_hop_num	= HNS_ROCE_PBL_HOP_NUM;
+	caps->eqe_ba_pg_sz	= 0;
+	caps->eqe_buf_pg_sz	= 0;
+	caps->eqe_hop_num	= HNS_ROCE_EQE_HOP_NUM;
+	caps->tsq_buf_pg_sz	= 0;
+
+	ret = hns_roce_query_pf_caps(hr_dev);
+	if (ret)
+		set_default_caps(hr_dev);
+
 	ret = hns_roce_alloc_vf_resource(hr_dev);
 	if (ret) {
 		dev_err(hr_dev->dev, "Allocate vf resource fail, ret = %d.\n",
@@ -1619,148 +2282,15 @@
 		return ret;
 	}
 
-	if (hr_dev->pci_dev->revision == 0x21) {
-		ret = hns_roce_set_vf_switch_param(hr_dev, 0);
-		if (ret) {
-			dev_err(hr_dev->dev,
-				"Set function switch param fail, ret = %d.\n",
-				ret);
-			return ret;
-		}
-	}
-
-	hr_dev->vendor_part_id = hr_dev->pci_dev->device;
-	hr_dev->sys_image_guid = be64_to_cpu(hr_dev->ib_dev.node_guid);
-
-	caps->num_qps		= HNS_ROCE_V2_MAX_QP_NUM;
-	caps->max_wqes		= HNS_ROCE_V2_MAX_WQE_NUM;
-	caps->num_cqs		= HNS_ROCE_V2_MAX_CQ_NUM;
-	caps->num_srqs		= HNS_ROCE_V2_MAX_SRQ_NUM;
-	caps->min_cqes		= HNS_ROCE_MIN_CQE_NUM;
-	caps->max_cqes		= HNS_ROCE_V2_MAX_CQE_NUM;
-	caps->max_srqwqes	= HNS_ROCE_V2_MAX_SRQWQE_NUM;
-	caps->max_sq_sg		= HNS_ROCE_V2_MAX_SQ_SGE_NUM;
-	caps->max_extend_sg	= HNS_ROCE_V2_MAX_EXTEND_SGE_NUM;
-	caps->max_rq_sg		= HNS_ROCE_V2_MAX_RQ_SGE_NUM;
-	caps->max_sq_inline	= HNS_ROCE_V2_MAX_SQ_INLINE;
-	caps->max_srq_sg	= HNS_ROCE_V2_MAX_SRQ_SGE_NUM;
-	caps->num_uars		= HNS_ROCE_V2_UAR_NUM;
-	caps->phy_num_uars	= HNS_ROCE_V2_PHY_UAR_NUM;
-	caps->num_aeq_vectors	= HNS_ROCE_V2_AEQE_VEC_NUM;
-	caps->num_comp_vectors	= HNS_ROCE_V2_COMP_VEC_NUM;
-	caps->num_other_vectors	= HNS_ROCE_V2_ABNORMAL_VEC_NUM;
-	caps->num_mtpts		= HNS_ROCE_V2_MAX_MTPT_NUM;
-	caps->num_mtt_segs	= HNS_ROCE_V2_MAX_MTT_SEGS;
-	caps->num_cqe_segs	= HNS_ROCE_V2_MAX_CQE_SEGS;
-	caps->num_srqwqe_segs	= HNS_ROCE_V2_MAX_SRQWQE_SEGS;
-	caps->num_idx_segs	= HNS_ROCE_V2_MAX_IDX_SEGS;
-	caps->num_pds		= HNS_ROCE_V2_MAX_PD_NUM;
-	caps->max_qp_init_rdma	= HNS_ROCE_V2_MAX_QP_INIT_RDMA;
-	caps->max_qp_dest_rdma	= HNS_ROCE_V2_MAX_QP_DEST_RDMA;
-	caps->max_sq_desc_sz	= HNS_ROCE_V2_MAX_SQ_DESC_SZ;
-	caps->max_rq_desc_sz	= HNS_ROCE_V2_MAX_RQ_DESC_SZ;
-	caps->max_srq_desc_sz	= HNS_ROCE_V2_MAX_SRQ_DESC_SZ;
-	caps->qpc_entry_sz	= HNS_ROCE_V2_QPC_ENTRY_SZ;
-	caps->irrl_entry_sz	= HNS_ROCE_V2_IRRL_ENTRY_SZ;
-	caps->trrl_entry_sz	= HNS_ROCE_V2_TRRL_ENTRY_SZ;
-	caps->cqc_entry_sz	= HNS_ROCE_V2_CQC_ENTRY_SZ;
-	caps->srqc_entry_sz	= HNS_ROCE_V2_SRQC_ENTRY_SZ;
-	caps->mtpt_entry_sz	= HNS_ROCE_V2_MTPT_ENTRY_SZ;
-	caps->mtt_entry_sz	= HNS_ROCE_V2_MTT_ENTRY_SZ;
-	caps->idx_entry_sz	= 4;
-	caps->cq_entry_sz	= HNS_ROCE_V2_CQE_ENTRY_SIZE;
-	caps->page_size_cap	= HNS_ROCE_V2_PAGE_SIZE_SUPPORTED;
-	caps->reserved_lkey	= 0;
-	caps->reserved_pds	= 0;
-	caps->reserved_mrws	= 1;
-	caps->reserved_uars	= 0;
-	caps->reserved_cqs	= 0;
-	caps->reserved_srqs	= 0;
-	caps->reserved_qps	= HNS_ROCE_V2_RSV_QPS;
-
-	caps->qpc_ba_pg_sz	= 0;
-	caps->qpc_buf_pg_sz	= 0;
-	caps->qpc_hop_num	= HNS_ROCE_CONTEXT_HOP_NUM;
-	caps->srqc_ba_pg_sz	= 0;
-	caps->srqc_buf_pg_sz	= 0;
-	caps->srqc_hop_num	= HNS_ROCE_CONTEXT_HOP_NUM;
-	caps->cqc_ba_pg_sz	= 0;
-	caps->cqc_buf_pg_sz	= 0;
-	caps->cqc_hop_num	= HNS_ROCE_CONTEXT_HOP_NUM;
-	caps->mpt_ba_pg_sz	= 0;
-	caps->mpt_buf_pg_sz	= 0;
-	caps->mpt_hop_num	= HNS_ROCE_CONTEXT_HOP_NUM;
-	caps->pbl_ba_pg_sz	= 2;
-	caps->pbl_buf_pg_sz	= 0;
-	caps->pbl_hop_num	= HNS_ROCE_PBL_HOP_NUM;
-	caps->mtt_ba_pg_sz	= 0;
-	caps->mtt_buf_pg_sz	= 0;
-	caps->mtt_hop_num	= HNS_ROCE_MTT_HOP_NUM;
-	caps->wqe_sq_hop_num	= 2;
-	caps->wqe_sge_hop_num	= 1;
-	caps->wqe_rq_hop_num	= 2;
-	caps->cqe_ba_pg_sz	= 6;
-	caps->cqe_buf_pg_sz	= 0;
-	caps->cqe_hop_num	= HNS_ROCE_CQE_HOP_NUM;
-	caps->srqwqe_ba_pg_sz	= 0;
-	caps->srqwqe_buf_pg_sz	= 0;
-	caps->srqwqe_hop_num	= HNS_ROCE_SRQWQE_HOP_NUM;
-	caps->idx_ba_pg_sz	= 0;
-	caps->idx_buf_pg_sz	= 0;
-	caps->idx_hop_num	= HNS_ROCE_IDX_HOP_NUM;
-	caps->eqe_ba_pg_sz	= 0;
-	caps->eqe_buf_pg_sz	= 0;
-	caps->eqe_hop_num	= HNS_ROCE_EQE_HOP_NUM;
-	caps->tsq_buf_pg_sz	= 0;
-	caps->chunk_sz		= HNS_ROCE_V2_TABLE_CHUNK_SIZE;
-
-	caps->flags		= HNS_ROCE_CAP_FLAG_REREG_MR |
-				  HNS_ROCE_CAP_FLAG_ROCE_V1_V2 |
-				  HNS_ROCE_CAP_FLAG_RQ_INLINE |
-				  HNS_ROCE_CAP_FLAG_RECORD_DB |
-				  HNS_ROCE_CAP_FLAG_SQ_RECORD_DB;
-
-	if (hr_dev->pci_dev->revision == 0x21)
-		caps->flags |= HNS_ROCE_CAP_FLAG_MW |
-			       HNS_ROCE_CAP_FLAG_FRMR;
-
-	caps->pkey_table_len[0] = 1;
-	caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM;
-	caps->ceqe_depth	= HNS_ROCE_V2_COMP_EQE_NUM;
-	caps->aeqe_depth	= HNS_ROCE_V2_ASYNC_EQE_NUM;
-	caps->local_ca_ack_delay = 0;
-	caps->max_mtu = IB_MTU_4096;
-
-	caps->max_srqs		= HNS_ROCE_V2_MAX_SRQ;
-	caps->max_srq_wrs	= HNS_ROCE_V2_MAX_SRQ_WR;
-	caps->max_srq_sges	= HNS_ROCE_V2_MAX_SRQ_SGE;
-
-	if (hr_dev->pci_dev->revision == 0x21) {
-		caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC |
-			       HNS_ROCE_CAP_FLAG_SRQ |
-			       HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL;
-
-		caps->num_qpc_timer	  = HNS_ROCE_V2_MAX_QPC_TIMER_NUM;
-		caps->qpc_timer_entry_sz  = HNS_ROCE_V2_QPC_TIMER_ENTRY_SZ;
-		caps->qpc_timer_ba_pg_sz  = 0;
-		caps->qpc_timer_buf_pg_sz = 0;
-		caps->qpc_timer_hop_num   = HNS_ROCE_HOP_NUM_0;
-		caps->num_cqc_timer	  = HNS_ROCE_V2_MAX_CQC_TIMER_NUM;
-		caps->cqc_timer_entry_sz  = HNS_ROCE_V2_CQC_TIMER_ENTRY_SZ;
-		caps->cqc_timer_ba_pg_sz  = 0;
-		caps->cqc_timer_buf_pg_sz = 0;
-		caps->cqc_timer_hop_num   = HNS_ROCE_HOP_NUM_0;
-
-		caps->sccc_entry_sz	= HNS_ROCE_V2_SCCC_ENTRY_SZ;
-		caps->sccc_ba_pg_sz	= 0;
-		caps->sccc_buf_pg_sz    = 0;
-		caps->sccc_hop_num	= HNS_ROCE_SCCC_HOP_NUM;
-	}
-
 	ret = hns_roce_v2_set_bt(hr_dev);
-	if (ret)
-		dev_err(hr_dev->dev, "Configure bt attribute fail, ret = %d.\n",
-			ret);
+	if (ret) {
+		dev_err(hr_dev->dev,
+			"Configure bt attribute fail, ret = %d.\n", ret);
+		return ret;
+	}
+
+	/* Configure the size of QPC, SCCC, etc. */
+	ret = hns_roce_config_entry_size(hr_dev);
 
 	return ret;
 }
@@ -1795,8 +2325,6 @@
 
 	page_num = link_tbl->npages;
 	entry = link_tbl->table.buf;
-	memset(req_a, 0, sizeof(*req_a));
-	memset(req_b, 0, sizeof(*req_b));
 
 	for (i = 0; i < 2; i++) {
 		hns_roce_cmq_setup_basic_desc(&desc[i], opcode, false);
@@ -1805,44 +2333,30 @@
 			desc[i].flag |= cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
 		else
 			desc[i].flag &= ~cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT);
-
-		if (i == 0) {
-			req_a->base_addr_l =
-				cpu_to_le32(link_tbl->table.map & 0xffffffff);
-			req_a->base_addr_h =
-				cpu_to_le32(link_tbl->table.map >> 32);
-			roce_set_field(req_a->depth_pgsz_init_en,
-				       CFG_LLM_QUE_DEPTH_M,
-				       CFG_LLM_QUE_DEPTH_S,
-				       link_tbl->npages);
-			roce_set_field(req_a->depth_pgsz_init_en,
-				       CFG_LLM_QUE_PGSZ_M,
-				       CFG_LLM_QUE_PGSZ_S,
-				       link_tbl->pg_sz);
-			req_a->head_ba_l = cpu_to_le32(entry[0].blk_ba0);
-			req_a->head_ba_h_nxtptr =
-				cpu_to_le32(entry[0].blk_ba1_nxt_ptr);
-			roce_set_field(req_a->head_ptr,
-				       CFG_LLM_HEAD_PTR_M,
-				       CFG_LLM_HEAD_PTR_S, 0);
-		} else {
-			req_b->tail_ba_l =
-				cpu_to_le32(entry[page_num - 1].blk_ba0);
-			roce_set_field(req_b->tail_ba_h,
-				       CFG_LLM_TAIL_BA_H_M,
-				       CFG_LLM_TAIL_BA_H_S,
-				       entry[page_num - 1].blk_ba1_nxt_ptr &
-				       HNS_ROCE_LINK_TABLE_BA1_M);
-			roce_set_field(req_b->tail_ptr,
-				       CFG_LLM_TAIL_PTR_M,
-				       CFG_LLM_TAIL_PTR_S,
-				       (entry[page_num - 2].blk_ba1_nxt_ptr &
-				       HNS_ROCE_LINK_TABLE_NXT_PTR_M) >>
-				       HNS_ROCE_LINK_TABLE_NXT_PTR_S);
-		}
 	}
-	roce_set_field(req_a->depth_pgsz_init_en,
-		       CFG_LLM_INIT_EN_M, CFG_LLM_INIT_EN_S, 1);
+
+	req_a->base_addr_l = cpu_to_le32(link_tbl->table.map & 0xffffffff);
+	req_a->base_addr_h = cpu_to_le32(link_tbl->table.map >> 32);
+	roce_set_field(req_a->depth_pgsz_init_en, CFG_LLM_QUE_DEPTH_M,
+		       CFG_LLM_QUE_DEPTH_S, link_tbl->npages);
+	roce_set_field(req_a->depth_pgsz_init_en, CFG_LLM_QUE_PGSZ_M,
+		       CFG_LLM_QUE_PGSZ_S, link_tbl->pg_sz);
+	roce_set_field(req_a->depth_pgsz_init_en, CFG_LLM_INIT_EN_M,
+		       CFG_LLM_INIT_EN_S, 1);
+	req_a->head_ba_l = cpu_to_le32(entry[0].blk_ba0);
+	req_a->head_ba_h_nxtptr = cpu_to_le32(entry[0].blk_ba1_nxt_ptr);
+	roce_set_field(req_a->head_ptr, CFG_LLM_HEAD_PTR_M, CFG_LLM_HEAD_PTR_S,
+		       0);
+
+	req_b->tail_ba_l = cpu_to_le32(entry[page_num - 1].blk_ba0);
+	roce_set_field(req_b->tail_ba_h, CFG_LLM_TAIL_BA_H_M,
+		       CFG_LLM_TAIL_BA_H_S,
+		       entry[page_num - 1].blk_ba1_nxt_ptr &
+		       HNS_ROCE_LINK_TABLE_BA1_M);
+	roce_set_field(req_b->tail_ptr, CFG_LLM_TAIL_PTR_M, CFG_LLM_TAIL_PTR_S,
+		       (entry[page_num - 2].blk_ba1_nxt_ptr &
+			HNS_ROCE_LINK_TABLE_NXT_PTR_M) >>
+			HNS_ROCE_LINK_TABLE_NXT_PTR_S);
 
 	return hns_roce_cmq_send(hr_dev, desc, 2);
 }
@@ -2014,8 +2528,7 @@
 {
 	struct hns_roce_v2_priv *priv = hr_dev->priv;
 
-	if (hr_dev->pci_dev->revision == 0x21)
-		hns_roce_function_clear(hr_dev);
+	hns_roce_function_clear(hr_dev);
 
 	hns_roce_free_link_table(hr_dev, &priv->tpq);
 	hns_roce_free_link_table(hr_dev, &priv->tsq);
@@ -2135,11 +2648,9 @@
 
 	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_SGID_TB, false);
 
-	roce_set_field(sgid_tb->table_idx_rsv,
-		       CFG_SGID_TB_TABLE_IDX_M,
+	roce_set_field(sgid_tb->table_idx_rsv, CFG_SGID_TB_TABLE_IDX_M,
 		       CFG_SGID_TB_TABLE_IDX_S, gid_index);
-	roce_set_field(sgid_tb->vf_sgid_type_rsv,
-		       CFG_SGID_TB_VF_SGID_TYPE_M,
+	roce_set_field(sgid_tb->vf_sgid_type_rsv, CFG_SGID_TB_VF_SGID_TYPE_M,
 		       CFG_SGID_TB_VF_SGID_TYPE_S, sgid_type);
 
 	p = (u32 *)&gid->raw[0];
@@ -2179,7 +2690,9 @@
 
 	ret = hns_roce_config_sgid_table(hr_dev, gid_index, gid, sgid_type);
 	if (ret)
-		dev_err(hr_dev->dev, "Configure sgid table failed(%d)!\n", ret);
+		ibdev_err(&hr_dev->ib_dev,
+			  "failed to configure sgid table, ret = %d!\n",
+			  ret);
 
 	return ret;
 }
@@ -2198,47 +2711,42 @@
 	reg_smac_l = *(u32 *)(&addr[0]);
 	reg_smac_h = *(u16 *)(&addr[4]);
 
-	memset(smac_tb, 0, sizeof(*smac_tb));
-	roce_set_field(smac_tb->tb_idx_rsv,
-		       CFG_SMAC_TB_IDX_M,
+	roce_set_field(smac_tb->tb_idx_rsv, CFG_SMAC_TB_IDX_M,
 		       CFG_SMAC_TB_IDX_S, phy_port);
-	roce_set_field(smac_tb->vf_smac_h_rsv,
-		       CFG_SMAC_TB_VF_SMAC_H_M,
+	roce_set_field(smac_tb->vf_smac_h_rsv, CFG_SMAC_TB_VF_SMAC_H_M,
 		       CFG_SMAC_TB_VF_SMAC_H_S, reg_smac_h);
 	smac_tb->vf_smac_l = cpu_to_le32(reg_smac_l);
 
 	return hns_roce_cmq_send(hr_dev, &desc, 1);
 }
 
-static int set_mtpt_pbl(struct hns_roce_v2_mpt_entry *mpt_entry,
+static int set_mtpt_pbl(struct hns_roce_dev *hr_dev,
+			struct hns_roce_v2_mpt_entry *mpt_entry,
 			struct hns_roce_mr *mr)
 {
-	struct sg_dma_page_iter sg_iter;
-	u64 page_addr;
-	u64 *pages;
-	int i;
+	u64 pages[HNS_ROCE_V2_MAX_INNER_MTPT_NUM] = { 0 };
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	dma_addr_t pbl_ba;
+	int i, count;
 
-	mpt_entry->pbl_size = cpu_to_le32(mr->pbl_size);
-	mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(mr->pbl_ba >> 3));
+	count = hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, pages,
+				  ARRAY_SIZE(pages), &pbl_ba);
+	if (count < 1) {
+		ibdev_err(ibdev, "failed to find PBL mtr, count = %d.\n",
+			  count);
+		return -ENOBUFS;
+	}
+
+	/* Aligned to the hardware address access unit */
+	for (i = 0; i < count; i++)
+		pages[i] >>= 6;
+
+	mpt_entry->pbl_size = cpu_to_le32(mr->npages);
+	mpt_entry->pbl_ba_l = cpu_to_le32(pbl_ba >> 3);
 	roce_set_field(mpt_entry->byte_48_mode_ba,
 		       V2_MPT_BYTE_48_PBL_BA_H_M, V2_MPT_BYTE_48_PBL_BA_H_S,
-		       upper_32_bits(mr->pbl_ba >> 3));
+		       upper_32_bits(pbl_ba >> 3));
 
-	pages = (u64 *)__get_free_page(GFP_KERNEL);
-	if (!pages)
-		return -ENOMEM;
-
-	i = 0;
-	for_each_sg_dma_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) {
-		page_addr = sg_page_iter_dma_address(&sg_iter);
-		pages[i] = page_addr >> 6;
-
-		/* Record the first 2 entry directly to MTPT table */
-		if (i >= HNS_ROCE_V2_MAX_INNER_MTPT_NUM - 1)
-			goto found;
-		i++;
-	}
-found:
 	mpt_entry->pa0_l = cpu_to_le32(lower_32_bits(pages[0]));
 	roce_set_field(mpt_entry->byte_56_pa0_h, V2_MPT_BYTE_56_PA0_H_M,
 		       V2_MPT_BYTE_56_PA0_H_S, upper_32_bits(pages[0]));
@@ -2249,14 +2757,13 @@
 	roce_set_field(mpt_entry->byte_64_buf_pa1,
 		       V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M,
 		       V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S,
-		       mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET);
-
-	free_page((unsigned long)pages);
+		       to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.buf_pg_shift));
 
 	return 0;
 }
 
-static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr,
+static int hns_roce_v2_write_mtpt(struct hns_roce_dev *hr_dev,
+				  void *mb_buf, struct hns_roce_mr *mr,
 				  unsigned long mtpt_idx)
 {
 	struct hns_roce_v2_mpt_entry *mpt_entry;
@@ -2273,7 +2780,7 @@
 	roce_set_field(mpt_entry->byte_4_pd_hop_st,
 		       V2_MPT_BYTE_4_PBL_BA_PG_SZ_M,
 		       V2_MPT_BYTE_4_PBL_BA_PG_SZ_S,
-		       mr->pbl_ba_pg_sz + PG_SHIFT_OFFSET);
+		       to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.ba_pg_shift));
 	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M,
 		       V2_MPT_BYTE_4_PD_S, mr->pd);
 
@@ -2305,7 +2812,7 @@
 	if (mr->type == MR_TYPE_DMA)
 		return 0;
 
-	ret = set_mtpt_pbl(mpt_entry, mr);
+	ret = set_mtpt_pbl(hr_dev, mpt_entry, mr);
 
 	return ret;
 }
@@ -2351,19 +2858,27 @@
 		mr->iova = iova;
 		mr->size = size;
 
-		ret = set_mtpt_pbl(mpt_entry, mr);
+		ret = set_mtpt_pbl(hr_dev, mpt_entry, mr);
 	}
 
 	return ret;
 }
 
-static int hns_roce_v2_frmr_write_mtpt(void *mb_buf, struct hns_roce_mr *mr)
+static int hns_roce_v2_frmr_write_mtpt(struct hns_roce_dev *hr_dev,
+				       void *mb_buf, struct hns_roce_mr *mr)
 {
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_v2_mpt_entry *mpt_entry;
+	dma_addr_t pbl_ba = 0;
 
 	mpt_entry = mb_buf;
 	memset(mpt_entry, 0, sizeof(*mpt_entry));
 
+	if (hns_roce_mtr_find(hr_dev, &mr->pbl_mtr, 0, NULL, 0, &pbl_ba) < 0) {
+		ibdev_err(ibdev, "failed to find frmr mtr.\n");
+		return -ENOBUFS;
+	}
+
 	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M,
 		       V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_FREE);
 	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PBL_HOP_NUM_M,
@@ -2371,7 +2886,7 @@
 	roce_set_field(mpt_entry->byte_4_pd_hop_st,
 		       V2_MPT_BYTE_4_PBL_BA_PG_SZ_M,
 		       V2_MPT_BYTE_4_PBL_BA_PG_SZ_S,
-		       mr->pbl_ba_pg_sz + PG_SHIFT_OFFSET);
+		       to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.ba_pg_shift));
 	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M,
 		       V2_MPT_BYTE_4_PD_S, mr->pd);
 
@@ -2384,17 +2899,17 @@
 	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_MR_MW_S, 0);
 	roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BPD_S, 1);
 
-	mpt_entry->pbl_size = cpu_to_le32(mr->pbl_size);
+	mpt_entry->pbl_size = cpu_to_le32(mr->npages);
 
-	mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(mr->pbl_ba >> 3));
+	mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(pbl_ba >> 3));
 	roce_set_field(mpt_entry->byte_48_mode_ba, V2_MPT_BYTE_48_PBL_BA_H_M,
 		       V2_MPT_BYTE_48_PBL_BA_H_S,
-		       upper_32_bits(mr->pbl_ba >> 3));
+		       upper_32_bits(pbl_ba >> 3));
 
 	roce_set_field(mpt_entry->byte_64_buf_pa1,
 		       V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M,
 		       V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S,
-		       mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET);
+		       to_hr_hw_page_shift(mr->pbl_mtr.hem_cfg.buf_pg_shift));
 
 	return 0;
 }
@@ -2410,11 +2925,10 @@
 		       V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_FREE);
 	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M,
 		       V2_MPT_BYTE_4_PD_S, mw->pdn);
-	roce_set_field(mpt_entry->byte_4_pd_hop_st,
-		       V2_MPT_BYTE_4_PBL_HOP_NUM_M,
+	roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PBL_HOP_NUM_M,
 		       V2_MPT_BYTE_4_PBL_HOP_NUM_S,
-		       mw->pbl_hop_num == HNS_ROCE_HOP_NUM_0 ?
-		       0 : mw->pbl_hop_num);
+		       mw->pbl_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 :
+							       mw->pbl_hop_num);
 	roce_set_field(mpt_entry->byte_4_pd_hop_st,
 		       V2_MPT_BYTE_4_PBL_BA_PG_SZ_M,
 		       V2_MPT_BYTE_4_PBL_BA_PG_SZ_S,
@@ -2442,8 +2956,7 @@
 
 static void *get_cqe_v2(struct hns_roce_cq *hr_cq, int n)
 {
-	return hns_roce_buf_offset(&hr_cq->hr_buf.hr_buf,
-				   n * HNS_ROCE_V2_CQE_ENTRY_SIZE);
+	return hns_roce_buf_offset(hr_cq->mtr.kmem, n * hr_cq->cqe_size);
 }
 
 static void *get_sw_cqe_v2(struct hns_roce_cq *hr_cq, int n)
@@ -2452,33 +2965,12 @@
 
 	/* Get the CQE when its Owner bit differs from the MSB of cons_idx */
 	return (roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_OWNER_S) ^
-		!!(n & (hr_cq->ib_cq.cqe + 1))) ? cqe : NULL;
+		!!(n & hr_cq->cq_depth)) ? cqe : NULL;
 }
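
The reworked get_sw_cqe_v2() keeps the usual owner-bit scheme: the consumer index only ever increments, and a CQE at index n counts as new when its owner bit differs from the lap-parity bit of n, i.e. the bit selected by cq_depth (assumed to be a power of two). The standalone model below only demonstrates the XOR idea; which polarity the hardware writes on a given lap is a device detail the sketch does not claim to reproduce.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* A CQE is new when its owner bit differs from the consumer's lap parity. */
static bool cqe_is_valid(uint32_t owner_bit, uint32_t cons_idx, uint32_t cq_depth)
{
	uint32_t sw_pass = !!(cons_idx & cq_depth);	/* flips every full lap */

	return (owner_bit ^ sw_pass) != 0;
}

int main(void)
{
	/* depth 64: the same owner bit reads as new on one lap, stale on the next */
	printf("%d\n", cqe_is_valid(1, 3, 64));		/* 1: new */
	printf("%d\n", cqe_is_valid(1, 64 + 3, 64));	/* 0: stale */
	return 0;
}
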
 
-static struct hns_roce_v2_cqe *next_cqe_sw_v2(struct hns_roce_cq *hr_cq)
+static inline void hns_roce_v2_cq_set_ci(struct hns_roce_cq *hr_cq, u32 ci)
 {
-	return get_sw_cqe_v2(hr_cq, hr_cq->cons_index);
-}
-
-static void *get_srq_wqe(struct hns_roce_srq *srq, int n)
-{
-	return hns_roce_buf_offset(&srq->buf, n << srq->wqe_shift);
-}
-
-static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, int wqe_index)
-{
-	/* always called with interrupts disabled. */
-	spin_lock(&srq->lock);
-
-	bitmap_clear(srq->idx_que.bitmap, wqe_index, 1);
-	srq->tail++;
-
-	spin_unlock(&srq->lock);
-}
-
-static void hns_roce_v2_cq_set_ci(struct hns_roce_cq *hr_cq, u32 cons_index)
-{
-	*hr_cq->set_ci_db = cons_index & 0xffffff;
+	*hr_cq->set_ci_db = ci & V2_CQ_DB_PARAMETER_CONS_IDX_M;
 }
 
 static void __hns_roce_v2_cq_clean(struct hns_roce_cq *hr_cq, u32 qpn,
@@ -2545,8 +3037,7 @@
 
 static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev,
 				  struct hns_roce_cq *hr_cq, void *mb_buf,
-				  u64 *mtts, dma_addr_t dma_handle, int nent,
-				  u32 vector)
+				  u64 *mtts, dma_addr_t dma_handle)
 {
 	struct hns_roce_v2_cq_context *cq_context;
 
@@ -2558,46 +3049,50 @@
 	roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_ARM_ST_M,
 		       V2_CQC_BYTE_4_ARM_ST_S, REG_NXT_CEQE);
 	roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_SHIFT_M,
-		       V2_CQC_BYTE_4_SHIFT_S, ilog2((unsigned int)nent));
+		       V2_CQC_BYTE_4_SHIFT_S, ilog2(hr_cq->cq_depth));
 	roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_CEQN_M,
-		       V2_CQC_BYTE_4_CEQN_S, vector);
+		       V2_CQC_BYTE_4_CEQN_S, hr_cq->vector);
 
 	roce_set_field(cq_context->byte_8_cqn, V2_CQC_BYTE_8_CQN_M,
 		       V2_CQC_BYTE_8_CQN_S, hr_cq->cqn);
 
-	cq_context->cqe_cur_blk_addr = cpu_to_le32(mtts[0] >> PAGE_ADDR_SHIFT);
+	roce_set_field(cq_context->byte_8_cqn, V2_CQC_BYTE_8_CQE_SIZE_M,
+		       V2_CQC_BYTE_8_CQE_SIZE_S, hr_cq->cqe_size ==
+		       HNS_ROCE_V3_CQE_SIZE ? 1 : 0);
+
+	cq_context->cqe_cur_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[0]));
 
 	roce_set_field(cq_context->byte_16_hop_addr,
 		       V2_CQC_BYTE_16_CQE_CUR_BLK_ADDR_M,
 		       V2_CQC_BYTE_16_CQE_CUR_BLK_ADDR_S,
-		       mtts[0] >> (32 + PAGE_ADDR_SHIFT));
+		       upper_32_bits(to_hr_hw_page_addr(mtts[0])));
 	roce_set_field(cq_context->byte_16_hop_addr,
 		       V2_CQC_BYTE_16_CQE_HOP_NUM_M,
 		       V2_CQC_BYTE_16_CQE_HOP_NUM_S, hr_dev->caps.cqe_hop_num ==
 		       HNS_ROCE_HOP_NUM_0 ? 0 : hr_dev->caps.cqe_hop_num);
 
-	cq_context->cqe_nxt_blk_addr = cpu_to_le32(mtts[1] >> PAGE_ADDR_SHIFT);
+	cq_context->cqe_nxt_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[1]));
 	roce_set_field(cq_context->byte_24_pgsz_addr,
 		       V2_CQC_BYTE_24_CQE_NXT_BLK_ADDR_M,
 		       V2_CQC_BYTE_24_CQE_NXT_BLK_ADDR_S,
-		       mtts[1] >> (32 + PAGE_ADDR_SHIFT));
+		       upper_32_bits(to_hr_hw_page_addr(mtts[1])));
 	roce_set_field(cq_context->byte_24_pgsz_addr,
 		       V2_CQC_BYTE_24_CQE_BA_PG_SZ_M,
 		       V2_CQC_BYTE_24_CQE_BA_PG_SZ_S,
-		       hr_dev->caps.cqe_ba_pg_sz + PG_SHIFT_OFFSET);
+		       to_hr_hw_page_shift(hr_cq->mtr.hem_cfg.ba_pg_shift));
 	roce_set_field(cq_context->byte_24_pgsz_addr,
 		       V2_CQC_BYTE_24_CQE_BUF_PG_SZ_M,
 		       V2_CQC_BYTE_24_CQE_BUF_PG_SZ_S,
-		       hr_dev->caps.cqe_buf_pg_sz + PG_SHIFT_OFFSET);
+		       to_hr_hw_page_shift(hr_cq->mtr.hem_cfg.buf_pg_shift));
 
 	cq_context->cqe_ba = cpu_to_le32(dma_handle >> 3);
 
 	roce_set_field(cq_context->byte_40_cqe_ba, V2_CQC_BYTE_40_CQE_BA_M,
 		       V2_CQC_BYTE_40_CQE_BA_S, (dma_handle >> (32 + 3)));
 
-	if (hr_cq->db_en)
-		roce_set_bit(cq_context->byte_44_db_record,
-			     V2_CQC_BYTE_44_DB_RECORD_EN_S, 1);
+	roce_set_bit(cq_context->byte_44_db_record,
+		     V2_CQC_BYTE_44_DB_RECORD_EN_S,
+		     (hr_cq->flags & HNS_ROCE_CQ_FLAG_RECORD_DB) ? 1 : 0);
 
 	roce_set_field(cq_context->byte_44_db_record,
 		       V2_CQC_BYTE_44_DB_RECORD_ADDR_M,
@@ -2637,8 +3132,7 @@
 	roce_set_field(doorbell[0], V2_CQ_DB_BYTE_4_CMD_M, V2_DB_BYTE_4_CMD_S,
 		       HNS_ROCE_V2_CQ_DB_NTR);
 	roce_set_field(doorbell[1], V2_CQ_DB_PARAMETER_CONS_IDX_M,
-		       V2_CQ_DB_PARAMETER_CONS_IDX_S,
-		       hr_cq->cons_index & ((hr_cq->cq_depth << 1) - 1));
+		       V2_CQ_DB_PARAMETER_CONS_IDX_S, hr_cq->cons_index);
 	roce_set_field(doorbell[1], V2_CQ_DB_PARAMETER_CMD_SN_M,
 		       V2_CQ_DB_PARAMETER_CMD_SN_S, hr_cq->arm_sn & 0x3);
 	roce_set_bit(doorbell[1], V2_CQ_DB_PARAMETER_NOTIFY_S,
@@ -2664,7 +3158,7 @@
 
 	sge_list = (*cur_qp)->rq_inl_buf.wqe_list[wr_cnt].sg_list;
 	sge_num = (*cur_qp)->rq_inl_buf.wqe_list[wr_cnt].sge_cnt;
-	wqe_buf = get_recv_wqe(*cur_qp, wr_cnt);
+	wqe_buf = hns_roce_get_recv_wqe(*cur_qp, wr_cnt);
 	data_len = wc->byte_len;
 
 	for (sge_cnt = 0; (sge_cnt < sge_num) && (data_len); sge_cnt++) {
@@ -2675,7 +3169,7 @@
 		wqe_buf += size;
 	}
 
-	if (data_len) {
+	if (unlikely(data_len)) {
 		wc->status = IB_WC_LOC_LEN_ERR;
 		return -EAGAIN;
 	}
@@ -2683,25 +3177,137 @@
 	return 0;
 }
 
+static int sw_comp(struct hns_roce_qp *hr_qp, struct hns_roce_wq *wq,
+		   int num_entries, struct ib_wc *wc)
+{
+	unsigned int left;
+	int npolled = 0;
+
+	left = wq->head - wq->tail;
+	if (left == 0)
+		return 0;
+
+	left = min_t(unsigned int, (unsigned int)num_entries, left);
+	while (npolled < left) {
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		wc->status = IB_WC_WR_FLUSH_ERR;
+		wc->vendor_err = 0;
+		wc->qp = &hr_qp->ibqp;
+
+		wq->tail++;
+		wc++;
+		npolled++;
+	}
+
+	return npolled;
+}
+
+static int hns_roce_v2_sw_poll_cq(struct hns_roce_cq *hr_cq, int num_entries,
+				  struct ib_wc *wc)
+{
+	struct hns_roce_qp *hr_qp;
+	int npolled = 0;
+
+	list_for_each_entry(hr_qp, &hr_cq->sq_list, sq_node) {
+		npolled += sw_comp(hr_qp, &hr_qp->sq,
+				   num_entries - npolled, wc + npolled);
+		if (npolled >= num_entries)
+			goto out;
+	}
+
+	list_for_each_entry(hr_qp, &hr_cq->rq_list, rq_node) {
+		npolled += sw_comp(hr_qp, &hr_qp->rq,
+				   num_entries - npolled, wc + npolled);
+		if (npolled >= num_entries)
+			goto out;
+	}
+
+out:
+	return npolled;
+}
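
sw_comp() and hns_roce_v2_sw_poll_cq() above generate completions purely in software once the hardware can no longer do so: every WQE still outstanding between wq->tail and wq->head is reported as IB_WC_WR_FLUSH_ERR so applications can reclaim their resources. The userspace model below flushes one work queue; struct wq_model is a stand-in for the driver's hns_roce_wq, not the real structure.

#include <stdio.h>

struct wq_model {
	unsigned int head;	/* next slot the producer will use */
	unsigned int tail;	/* oldest slot still awaiting a completion */
};

/* Report up to max_entries outstanding WQEs as flushed and advance the tail. */
static int sw_flush(struct wq_model *wq, unsigned int max_entries)
{
	unsigned int left = wq->head - wq->tail;
	unsigned int polled = 0;

	if (left > max_entries)
		left = max_entries;

	while (polled < left) {
		/* the driver fills an ib_wc with IB_WC_WR_FLUSH_ERR here */
		wq->tail++;
		polled++;
	}
	return (int)polled;
}

int main(void)
{
	struct wq_model wq = { .head = 7, .tail = 3 };

	printf("flushed %d wqes\n", sw_flush(&wq, 16));	/* flushed 4 wqes */
	return 0;
}
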
+
+static void get_cqe_status(struct hns_roce_dev *hr_dev, struct hns_roce_qp *qp,
+			   struct hns_roce_cq *cq, struct hns_roce_v2_cqe *cqe,
+			   struct ib_wc *wc)
+{
+	static const struct {
+		u32 cqe_status;
+		enum ib_wc_status wc_status;
+	} map[] = {
+		{ HNS_ROCE_CQE_V2_SUCCESS, IB_WC_SUCCESS },
+		{ HNS_ROCE_CQE_V2_LOCAL_LENGTH_ERR, IB_WC_LOC_LEN_ERR },
+		{ HNS_ROCE_CQE_V2_LOCAL_QP_OP_ERR, IB_WC_LOC_QP_OP_ERR },
+		{ HNS_ROCE_CQE_V2_LOCAL_PROT_ERR, IB_WC_LOC_PROT_ERR },
+		{ HNS_ROCE_CQE_V2_WR_FLUSH_ERR, IB_WC_WR_FLUSH_ERR },
+		{ HNS_ROCE_CQE_V2_MW_BIND_ERR, IB_WC_MW_BIND_ERR },
+		{ HNS_ROCE_CQE_V2_BAD_RESP_ERR, IB_WC_BAD_RESP_ERR },
+		{ HNS_ROCE_CQE_V2_LOCAL_ACCESS_ERR, IB_WC_LOC_ACCESS_ERR },
+		{ HNS_ROCE_CQE_V2_REMOTE_INVAL_REQ_ERR, IB_WC_REM_INV_REQ_ERR },
+		{ HNS_ROCE_CQE_V2_REMOTE_ACCESS_ERR, IB_WC_REM_ACCESS_ERR },
+		{ HNS_ROCE_CQE_V2_REMOTE_OP_ERR, IB_WC_REM_OP_ERR },
+		{ HNS_ROCE_CQE_V2_TRANSPORT_RETRY_EXC_ERR,
+		  IB_WC_RETRY_EXC_ERR },
+		{ HNS_ROCE_CQE_V2_RNR_RETRY_EXC_ERR, IB_WC_RNR_RETRY_EXC_ERR },
+		{ HNS_ROCE_CQE_V2_REMOTE_ABORT_ERR, IB_WC_REM_ABORT_ERR },
+		{ HNS_ROCE_CQE_V2_GENERAL_ERR, IB_WC_GENERAL_ERR}
+	};
+
+	u32 cqe_status = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_STATUS_M,
+					V2_CQE_BYTE_4_STATUS_S);
+	int i;
+
+	wc->status = IB_WC_GENERAL_ERR;
+	for (i = 0; i < ARRAY_SIZE(map); i++)
+		if (cqe_status == map[i].cqe_status) {
+			wc->status = map[i].wc_status;
+			break;
+		}
+
+	if (likely(wc->status == IB_WC_SUCCESS ||
+		   wc->status == IB_WC_WR_FLUSH_ERR))
+		return;
+
+	ibdev_err(&hr_dev->ib_dev, "error cqe status 0x%x:\n", cqe_status);
+	print_hex_dump(KERN_ERR, "", DUMP_PREFIX_NONE, 16, 4, cqe,
+		       cq->cqe_size, false);
+
+	/*
+	 * For hns ROCEE, GENERAL_ERR is an error type that is not defined in
+	 * the standard protocol; the driver must ignore it and need not set
+	 * the QP to an error state.
+	 */
+	if (cqe_status == HNS_ROCE_CQE_V2_GENERAL_ERR)
+		return;
+
+	/*
+	 * Hip08 hardware cannot flush the WQEs in the SQ/RQ when the QP enters
+	 * an error state, so as a workaround to this hardware limitation the
+	 * driver needs to assist in flushing. But the flush operation uses a
+	 * mailbox to convey the QP state to the hardware, and that call can
+	 * sleep because of the mutex protection around the mailbox, so the
+	 * deferred flush is used for now. Once a WC error is detected, the
+	 * flush operation is needed.
+	 */
+	if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG, &qp->flush_flag))
+		init_flush_work(hr_dev, qp);
+}
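
get_cqe_status() replaces the long switch statement removed further down in this patch with a lookup table from hardware CQE status codes to ib_wc_status values, falling back to IB_WC_GENERAL_ERR for anything it does not recognise. A self-contained model of that table-driven mapping follows; the enum values are invented for the sketch and do not match the hardware encoding.

#include <stddef.h>
#include <stdio.h>

/* Illustrative codes standing in for the hardware status and ib_wc values. */
enum hw_status { HW_OK = 0, HW_LEN_ERR = 1, HW_FLUSH_ERR = 5, HW_GENERAL_ERR = 30 };
enum wc_status { WC_SUCCESS, WC_LOC_LEN_ERR, WC_WR_FLUSH_ERR, WC_GENERAL_ERR };

static enum wc_status map_status(enum hw_status hw)
{
	static const struct {
		enum hw_status hw;
		enum wc_status wc;
	} map[] = {
		{ HW_OK, WC_SUCCESS },
		{ HW_LEN_ERR, WC_LOC_LEN_ERR },
		{ HW_FLUSH_ERR, WC_WR_FLUSH_ERR },
	};
	size_t i;

	for (i = 0; i < sizeof(map) / sizeof(map[0]); i++)
		if (map[i].hw == hw)
			return map[i].wc;

	return WC_GENERAL_ERR;	/* unknown codes fall back to a generic error */
}

int main(void)
{
	printf("%d\n", map_status(HW_FLUSH_ERR));	/* 2, i.e. WC_WR_FLUSH_ERR */
	return 0;
}
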
+
 static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 				struct hns_roce_qp **cur_qp, struct ib_wc *wc)
 {
+	struct hns_roce_dev *hr_dev = to_hr_dev(hr_cq->ib_cq.device);
 	struct hns_roce_srq *srq = NULL;
-	struct hns_roce_dev *hr_dev;
 	struct hns_roce_v2_cqe *cqe;
 	struct hns_roce_qp *hr_qp;
 	struct hns_roce_wq *wq;
-	struct ib_qp_attr attr;
-	int attr_mask;
 	int is_send;
 	u16 wqe_ctr;
 	u32 opcode;
-	u32 status;
 	int qpn;
 	int ret;
 
 	/* Find cqe according to consumer index */
-	cqe = next_cqe_sw_v2(hr_cq);
+	cqe = get_sw_cqe_v2(hr_cq, hr_cq->cons_index);
 	if (!cqe)
 		return -EAGAIN;
 
@@ -2716,11 +3322,11 @@
 				V2_CQE_BYTE_16_LCL_QPN_S);
 
 	if (!*cur_qp || (qpn & HNS_ROCE_V2_CQE_QPN_MASK) != (*cur_qp)->qpn) {
-		hr_dev = to_hr_dev(hr_cq->ib_cq.device);
 		hr_qp = __hns_roce_qp_lookup(hr_dev, qpn);
 		if (unlikely(!hr_qp)) {
-			dev_err(hr_dev->dev, "CQ %06lx with entry for unknown QPN %06x\n",
-				hr_cq->cqn, (qpn & HNS_ROCE_V2_CQE_QPN_MASK));
+			ibdev_err(&hr_dev->ib_dev,
+				  "CQ %06lx with entry for unknown QPN %06x\n",
+				  hr_cq->cqn, qpn & HNS_ROCE_V2_CQE_QPN_MASK);
 			return -EINVAL;
 		}
 		*cur_qp = hr_qp;
@@ -2760,67 +3366,8 @@
 		++wq->tail;
 	}
 
-	status = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_STATUS_M,
-				V2_CQE_BYTE_4_STATUS_S);
-	switch (status & HNS_ROCE_V2_CQE_STATUS_MASK) {
-	case HNS_ROCE_CQE_V2_SUCCESS:
-		wc->status = IB_WC_SUCCESS;
-		break;
-	case HNS_ROCE_CQE_V2_LOCAL_LENGTH_ERR:
-		wc->status = IB_WC_LOC_LEN_ERR;
-		break;
-	case HNS_ROCE_CQE_V2_LOCAL_QP_OP_ERR:
-		wc->status = IB_WC_LOC_QP_OP_ERR;
-		break;
-	case HNS_ROCE_CQE_V2_LOCAL_PROT_ERR:
-		wc->status = IB_WC_LOC_PROT_ERR;
-		break;
-	case HNS_ROCE_CQE_V2_WR_FLUSH_ERR:
-		wc->status = IB_WC_WR_FLUSH_ERR;
-		break;
-	case HNS_ROCE_CQE_V2_MW_BIND_ERR:
-		wc->status = IB_WC_MW_BIND_ERR;
-		break;
-	case HNS_ROCE_CQE_V2_BAD_RESP_ERR:
-		wc->status = IB_WC_BAD_RESP_ERR;
-		break;
-	case HNS_ROCE_CQE_V2_LOCAL_ACCESS_ERR:
-		wc->status = IB_WC_LOC_ACCESS_ERR;
-		break;
-	case HNS_ROCE_CQE_V2_REMOTE_INVAL_REQ_ERR:
-		wc->status = IB_WC_REM_INV_REQ_ERR;
-		break;
-	case HNS_ROCE_CQE_V2_REMOTE_ACCESS_ERR:
-		wc->status = IB_WC_REM_ACCESS_ERR;
-		break;
-	case HNS_ROCE_CQE_V2_REMOTE_OP_ERR:
-		wc->status = IB_WC_REM_OP_ERR;
-		break;
-	case HNS_ROCE_CQE_V2_TRANSPORT_RETRY_EXC_ERR:
-		wc->status = IB_WC_RETRY_EXC_ERR;
-		break;
-	case HNS_ROCE_CQE_V2_RNR_RETRY_EXC_ERR:
-		wc->status = IB_WC_RNR_RETRY_EXC_ERR;
-		break;
-	case HNS_ROCE_CQE_V2_REMOTE_ABORT_ERR:
-		wc->status = IB_WC_REM_ABORT_ERR;
-		break;
-	default:
-		wc->status = IB_WC_GENERAL_ERR;
-		break;
-	}
-
-	/* flush cqe if wc status is error, excluding flush error */
-	if ((wc->status != IB_WC_SUCCESS) &&
-	    (wc->status != IB_WC_WR_FLUSH_ERR)) {
-		attr_mask = IB_QP_STATE;
-		attr.qp_state = IB_QPS_ERR;
-		return hns_roce_v2_modify_qp(&(*cur_qp)->ibqp,
-					     &attr, attr_mask,
-					     (*cur_qp)->state, IB_QPS_ERR);
-	}
-
-	if (wc->status == IB_WC_WR_FLUSH_ERR)
+	get_cqe_status(hr_dev, *cur_qp, hr_cq, cqe, wc);
+	if (unlikely(wc->status != IB_WC_SUCCESS))
 		return 0;
 
 	if (is_send) {
@@ -2828,51 +3375,51 @@
 		/* SQ corresponding to CQE */
 		switch (roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_OPCODE_M,
 				       V2_CQE_BYTE_4_OPCODE_S) & 0x1f) {
-		case HNS_ROCE_SQ_OPCODE_SEND:
+		case HNS_ROCE_V2_WQE_OP_SEND:
 			wc->opcode = IB_WC_SEND;
 			break;
-		case HNS_ROCE_SQ_OPCODE_SEND_WITH_INV:
+		case HNS_ROCE_V2_WQE_OP_SEND_WITH_INV:
 			wc->opcode = IB_WC_SEND;
 			break;
-		case HNS_ROCE_SQ_OPCODE_SEND_WITH_IMM:
+		case HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM:
 			wc->opcode = IB_WC_SEND;
 			wc->wc_flags |= IB_WC_WITH_IMM;
 			break;
-		case HNS_ROCE_SQ_OPCODE_RDMA_READ:
+		case HNS_ROCE_V2_WQE_OP_RDMA_READ:
 			wc->opcode = IB_WC_RDMA_READ;
 			wc->byte_len = le32_to_cpu(cqe->byte_cnt);
 			break;
-		case HNS_ROCE_SQ_OPCODE_RDMA_WRITE:
+		case HNS_ROCE_V2_WQE_OP_RDMA_WRITE:
 			wc->opcode = IB_WC_RDMA_WRITE;
 			break;
-		case HNS_ROCE_SQ_OPCODE_RDMA_WRITE_WITH_IMM:
+		case HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM:
 			wc->opcode = IB_WC_RDMA_WRITE;
 			wc->wc_flags |= IB_WC_WITH_IMM;
 			break;
-		case HNS_ROCE_SQ_OPCODE_LOCAL_INV:
+		case HNS_ROCE_V2_WQE_OP_LOCAL_INV:
 			wc->opcode = IB_WC_LOCAL_INV;
 			wc->wc_flags |= IB_WC_WITH_INVALIDATE;
 			break;
-		case HNS_ROCE_SQ_OPCODE_ATOMIC_COMP_AND_SWAP:
+		case HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP:
 			wc->opcode = IB_WC_COMP_SWAP;
 			wc->byte_len  = 8;
 			break;
-		case HNS_ROCE_SQ_OPCODE_ATOMIC_FETCH_AND_ADD:
+		case HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD:
 			wc->opcode = IB_WC_FETCH_ADD;
 			wc->byte_len  = 8;
 			break;
-		case HNS_ROCE_SQ_OPCODE_ATOMIC_MASK_COMP_AND_SWAP:
+		case HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP:
 			wc->opcode = IB_WC_MASKED_COMP_SWAP;
 			wc->byte_len  = 8;
 			break;
-		case HNS_ROCE_SQ_OPCODE_ATOMIC_MASK_FETCH_AND_ADD:
+		case HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD:
 			wc->opcode = IB_WC_MASKED_FETCH_ADD;
 			wc->byte_len  = 8;
 			break;
-		case HNS_ROCE_SQ_OPCODE_FAST_REG_WR:
+		case HNS_ROCE_V2_WQE_OP_FAST_REG_PMR:
 			wc->opcode = IB_WC_REG_MR;
 			break;
-		case HNS_ROCE_SQ_OPCODE_BIND_MW:
+		case HNS_ROCE_V2_WQE_OP_BIND_MW:
 			wc->opcode = IB_WC_REG_MR;
 			break;
 		default:
@@ -2919,7 +3466,7 @@
 		    opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_INV) &&
 		    (roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_RQ_INLINE_S))) {
 			ret = hns_roce_handle_recv_inl_wqe(cqe, cur_qp, wc);
-			if (ret)
+			if (unlikely(ret))
 				return -EAGAIN;
 		}
 
@@ -2935,14 +3482,7 @@
 		wc->port_num = roce_get_field(cqe->byte_32,
 				V2_CQE_BYTE_32_PORTN_M, V2_CQE_BYTE_32_PORTN_S);
 		wc->pkey_index = 0;
-		memcpy(wc->smac, cqe->smac, 4);
-		wc->smac[4] = roce_get_field(cqe->byte_28,
-					     V2_CQE_BYTE_28_SMAC_4_M,
-					     V2_CQE_BYTE_28_SMAC_4_S);
-		wc->smac[5] = roce_get_field(cqe->byte_28,
-					     V2_CQE_BYTE_28_SMAC_5_M,
-					     V2_CQE_BYTE_28_SMAC_5_S);
-		wc->wc_flags |= IB_WC_WITH_SMAC;
+
 		if (roce_get_bit(cqe->byte_28, V2_CQE_BYTE_28_VID_VLD_S)) {
 			wc->vlan_id = (u16)roce_get_field(cqe->byte_28,
 							  V2_CQE_BYTE_28_VID_M,
@@ -2963,6 +3503,7 @@
 static int hns_roce_v2_poll_cq(struct ib_cq *ibcq, int num_entries,
 			       struct ib_wc *wc)
 {
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibcq->device);
 	struct hns_roce_cq *hr_cq = to_hr_cq(ibcq);
 	struct hns_roce_qp *cur_qp = NULL;
 	unsigned long flags;
@@ -2970,6 +3511,18 @@
 
 	spin_lock_irqsave(&hr_cq->lock, flags);
 
+	/*
+	 * When the device starts to reset, the state is RST_DOWN. At this time,
+	 * there may still be some valid CQEs in the hardware that are not
+	 * polled. Therefore, it is not allowed to switch to the software mode
+	 * immediately. When the state changes to UNINIT, CQE no longer exists
+	 * in the hardware, and then switch to software mode.
+	 */
+	if (hr_dev->state == HNS_ROCE_DEVICE_STATE_UNINIT) {
+		npolled = hns_roce_v2_sw_poll_cq(hr_cq, num_entries, wc);
+		goto out;
+	}
+
 	for (npolled = 0; npolled < num_entries; ++npolled) {
 		if (hns_roce_v2_poll_one(hr_cq, &cur_qp, wc + npolled))
 			break;
@@ -2981,6 +3534,7 @@
 		hns_roce_v2_cq_set_ci(hr_cq, hr_cq->cons_index);
 	}
 
+out:
 	spin_unlock_irqrestore(&hr_cq->lock, flags);
 
 	return npolled;
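The hunk above only falls back to the software CQE list once the device state reaches UNINIT, so CQEs still held by the hardware during RST_DOWN are not lost. A minimal user-space sketch of that decision path, with poll_hw()/poll_sw() as hypothetical stand-ins for hns_roce_v2_poll_one() and hns_roce_v2_sw_poll_cq():

#include <stdio.h>

enum dev_state { DEV_STATE_RST_DOWN, DEV_STATE_UNINIT, DEV_STATE_INITED };

/* Hypothetical stand-ins for the driver's hardware and software pollers. */
static int poll_hw(int budget) { return budget / 2; }	/* pretend half the entries complete */
static int poll_sw(int budget) { return budget; }	/* software list drains fully */

static int poll_cq(enum dev_state state, int budget)
{
	/*
	 * Same rule as the patch: only once the device reaches UNINIT is it
	 * safe to stop reading the hardware CQ and serve flushed CQEs from
	 * the software list; during RST_DOWN valid hardware CQEs may remain.
	 */
	if (state == DEV_STATE_UNINIT)
		return poll_sw(budget);

	return poll_hw(budget);
}

int main(void)
{
	printf("RST_DOWN polled %d, UNINIT polled %d\n",
	       poll_cq(DEV_STATE_RST_DOWN, 8), poll_cq(DEV_STATE_UNINIT, 8));
	return 0;
}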
@@ -3018,18 +3572,40 @@
 		break;
 	default:
 		dev_warn(hr_dev->dev,
-			 "Table %d not to be written by mailbox!\n", type);
+			 "table %u not to be written by mailbox!\n", type);
 		return -EINVAL;
 	}
 
 	return op + step_idx;
 }
 
+static int set_hem_to_hw(struct hns_roce_dev *hr_dev, int obj, u64 bt_ba,
+			 u32 hem_type, int step_idx)
+{
+	struct hns_roce_cmd_mailbox *mailbox;
+	int ret;
+	int op;
+
+	op = get_op_for_set_hem(hr_dev, hem_type, step_idx);
+	if (op < 0)
+		return 0;
+
+	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	ret = hns_roce_cmd_mbox(hr_dev, bt_ba, mailbox->dma, obj,
+				0, op, HNS_ROCE_CMD_TIMEOUT_MSECS);
+
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+
+	return ret;
+}
+
 static int hns_roce_v2_set_hem(struct hns_roce_dev *hr_dev,
 			       struct hns_roce_hem_table *table, int obj,
 			       int step_idx)
 {
-	struct hns_roce_cmd_mailbox *mailbox;
 	struct hns_roce_hem_iter iter;
 	struct hns_roce_hem_mhop mhop;
 	struct hns_roce_hem *hem;
@@ -3041,7 +3617,6 @@
 	u64 bt_ba = 0;
 	u32 chunk_ba_num;
 	u32 hop_num;
-	int op;
 
 	if (!hns_roce_check_whether_mhop(hr_dev, table->type))
 		return 0;
@@ -3063,14 +3638,6 @@
 		hem_idx = i;
 	}
 
-	op = get_op_for_set_hem(hr_dev, table->type, step_idx);
-	if (op == -EINVAL)
-		return 0;
-
-	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
-	if (IS_ERR(mailbox))
-		return PTR_ERR(mailbox);
-
 	if (table->type == HEM_TYPE_SCCC)
 		obj = mhop.l0_idx;
 
@@ -3079,11 +3646,8 @@
 		for (hns_roce_hem_first(hem, &iter);
 		     !hns_roce_hem_last(&iter); hns_roce_hem_next(&iter)) {
 			bt_ba = hns_roce_hem_addr(&iter);
-
-			/* configure the ba, tag, and op */
-			ret = hns_roce_cmd_mbox(hr_dev, bt_ba, mailbox->dma,
-						obj, 0, op,
-						HNS_ROCE_CMD_TIMEOUT_MSECS);
+			ret = set_hem_to_hw(hr_dev, obj, bt_ba, table->type,
+					    step_idx);
 		}
 	} else {
 		if (step_idx == 0)
@@ -3091,12 +3655,9 @@
 		else if (step_idx == 1 && hop_num == 2)
 			bt_ba = table->bt_l1_dma_addr[l1_idx];
 
-		/* configure the ba, tag, and op */
-		ret = hns_roce_cmd_mbox(hr_dev, bt_ba, mailbox->dma, obj,
-					0, op, HNS_ROCE_CMD_TIMEOUT_MSECS);
+		ret = set_hem_to_hw(hr_dev, obj, bt_ba, table->type, step_idx);
 	}
 
-	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
 	return ret;
 }
 
@@ -3130,7 +3691,7 @@
 		op = HNS_ROCE_CMD_DESTROY_SRQC_BT0;
 		break;
 	default:
-		dev_warn(dev, "Table %d not to be destroyed by mailbox!\n",
+		dev_warn(dev, "table %u not to be destroyed by mailbox!\n",
 			 table->type);
 		return 0;
 	}
@@ -3155,19 +3716,22 @@
 }
 
 static int hns_roce_v2_qp_modify(struct hns_roce_dev *hr_dev,
-				 enum ib_qp_state cur_state,
-				 enum ib_qp_state new_state,
 				 struct hns_roce_v2_qp_context *context,
+				 struct hns_roce_v2_qp_context *qpc_mask,
 				 struct hns_roce_qp *hr_qp)
 {
 	struct hns_roce_cmd_mailbox *mailbox;
+	int qpc_size;
 	int ret;
 
 	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
 	if (IS_ERR(mailbox))
 		return PTR_ERR(mailbox);
 
-	memcpy(mailbox->buf, context, sizeof(*context) * 2);
+	/* The qpc size of HIP08 is only 256B, which is half of HIP09 */
+	qpc_size = hr_dev->caps.qpc_sz;
+	memcpy(mailbox->buf, context, qpc_size);
+	memcpy(mailbox->buf + qpc_size, qpc_mask, qpc_size);
 
 	ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, hr_qp->qpn, 0,
 				HNS_ROCE_CMD_MODIFY_QPC,
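With this change the mailbox buffer for MODIFY_QPC carries the context immediately followed by its mask, each hr_dev->caps.qpc_sz bytes long (256 B on HIP08 per the comment, larger on HIP09). A hedged sketch of that packing, with a made-up fixed size standing in for caps.qpc_sz:

#include <stdio.h>
#include <string.h>

#define QPC_SZ 256	/* assumed HIP08-style size; HIP09 uses a larger value */

/* Pack context then mask back to back, as hns_roce_v2_qp_modify() now does. */
static void pack_qpc_mailbox(void *mb_buf, const void *ctx, const void *mask)
{
	memcpy(mb_buf, ctx, QPC_SZ);
	memcpy((char *)mb_buf + QPC_SZ, mask, QPC_SZ);
}

int main(void)
{
	unsigned char ctx[QPC_SZ], mask[QPC_SZ], mb[2 * QPC_SZ];

	memset(ctx, 0x00, sizeof(ctx));		/* fields to program */
	memset(mask, 0xff, sizeof(mask));	/* 1 = keep, 0 = update */
	pack_qpc_mailbox(mb, ctx, mask);
	printf("mailbox payload is %zu bytes: context followed by mask\n", sizeof(mb));
	return 0;
}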
@@ -3206,43 +3770,27 @@
 	roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S,
 		     !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
 	roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, 0);
+	roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_EXT_ATE_S,
+		     !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
+	roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_EXT_ATE_S, 0);
 }
 
 static void set_qpc_wqe_cnt(struct hns_roce_qp *hr_qp,
 			    struct hns_roce_v2_qp_context *context,
 			    struct hns_roce_v2_qp_context *qpc_mask)
 {
-	if (hr_qp->ibqp.qp_type == IB_QPT_GSI)
-		roce_set_field(context->byte_4_sqpn_tst,
-			       V2_QPC_BYTE_4_SGE_SHIFT_M,
-			       V2_QPC_BYTE_4_SGE_SHIFT_S,
-			       ilog2((unsigned int)hr_qp->sge.sge_cnt));
-	else
-		roce_set_field(context->byte_4_sqpn_tst,
-			       V2_QPC_BYTE_4_SGE_SHIFT_M,
-			       V2_QPC_BYTE_4_SGE_SHIFT_S,
-			       hr_qp->sq.max_gs >
-			       HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE ?
-			       ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0);
-
-	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M,
-		       V2_QPC_BYTE_4_SGE_SHIFT_S, 0);
+	roce_set_field(context->byte_4_sqpn_tst,
+		       V2_QPC_BYTE_4_SGE_SHIFT_M, V2_QPC_BYTE_4_SGE_SHIFT_S,
+		       to_hr_hem_entries_shift(hr_qp->sge.sge_cnt,
+					       hr_qp->sge.sge_shift));
 
 	roce_set_field(context->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S,
-		       ilog2((unsigned int)hr_qp->sq.wqe_cnt));
-	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_SQ_SHIFT_M, V2_QPC_BYTE_20_SQ_SHIFT_S, 0);
+		       ilog2(hr_qp->sq.wqe_cnt));
 
 	roce_set_field(context->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S,
-		       (hr_qp->ibqp.qp_type == IB_QPT_XRC_INI ||
-		       hr_qp->ibqp.qp_type == IB_QPT_XRC_TGT ||
-		       hr_qp->ibqp.srq) ? 0 :
-		       ilog2((unsigned int)hr_qp->rq.wqe_cnt));
-
-	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_RQ_SHIFT_M, V2_QPC_BYTE_20_RQ_SHIFT_S, 0);
+		       ilog2(hr_qp->rq.wqe_cnt));
 }
 
 static void modify_qp_reset_to_init(struct ib_qp *ibqp,
@@ -3262,280 +3810,50 @@
 	 */
 	roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M,
 		       V2_QPC_BYTE_4_TST_S, to_hr_qp_type(hr_qp->ibqp.qp_type));
-	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M,
-		       V2_QPC_BYTE_4_TST_S, 0);
 
 	roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M,
 		       V2_QPC_BYTE_4_SQPN_S, hr_qp->qpn);
-	roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M,
-		       V2_QPC_BYTE_4_SQPN_S, 0);
 
 	roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M,
 		       V2_QPC_BYTE_16_PD_S, to_hr_pd(ibqp->pd)->pdn);
-	roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M,
-		       V2_QPC_BYTE_16_PD_S, 0);
 
 	roce_set_field(context->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_RQWS_M,
 		       V2_QPC_BYTE_20_RQWS_S, ilog2(hr_qp->rq.max_gs));
-	roce_set_field(qpc_mask->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_RQWS_M,
-		       V2_QPC_BYTE_20_RQWS_S, 0);
 
 	set_qpc_wqe_cnt(hr_qp, context, qpc_mask);
 
 	/* No VLAN need to set 0xFFF */
 	roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M,
 		       V2_QPC_BYTE_24_VLAN_ID_S, 0xfff);
-	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M,
-		       V2_QPC_BYTE_24_VLAN_ID_S, 0);
 
-	/*
-	 * Set some fields in context to zero, Because the default values
-	 * of all fields in context are zero, we need not set them to 0 again.
-	 * but we should set the relevant fields of context mask to 0.
-	 */
-	roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_SQ_TX_ERR_S, 0);
-	roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_SQ_RX_ERR_S, 0);
-	roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_RQ_TX_ERR_S, 0);
-	roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_RQ_RX_ERR_S, 0);
-
-	roce_set_field(qpc_mask->byte_60_qpst_tempid, V2_QPC_BYTE_60_TEMPID_M,
-		       V2_QPC_BYTE_60_TEMPID_S, 0);
-
-	roce_set_field(qpc_mask->byte_60_qpst_tempid,
-		       V2_QPC_BYTE_60_SCC_TOKEN_M, V2_QPC_BYTE_60_SCC_TOKEN_S,
-		       0);
-	roce_set_bit(qpc_mask->byte_60_qpst_tempid,
-		     V2_QPC_BYTE_60_SQ_DB_DOING_S, 0);
-	roce_set_bit(qpc_mask->byte_60_qpst_tempid,
-		     V2_QPC_BYTE_60_RQ_DB_DOING_S, 0);
-	roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CNP_TX_FLAG_S, 0);
-	roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CE_FLAG_S, 0);
-
-	if (hr_qp->rdb_en) {
+	if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB)
 		roce_set_bit(context->byte_68_rq_db,
 			     V2_QPC_BYTE_68_RQ_RECORD_EN_S, 1);
-		roce_set_bit(qpc_mask->byte_68_rq_db,
-			     V2_QPC_BYTE_68_RQ_RECORD_EN_S, 0);
-	}
 
 	roce_set_field(context->byte_68_rq_db,
 		       V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_M,
 		       V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_S,
 		       ((u32)hr_qp->rdb.dma) >> 1);
-	roce_set_field(qpc_mask->byte_68_rq_db,
-		       V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_M,
-		       V2_QPC_BYTE_68_RQ_DB_RECORD_ADDR_S, 0);
 	context->rq_db_record_addr = cpu_to_le32(hr_qp->rdb.dma >> 32);
-	qpc_mask->rq_db_record_addr = 0;
 
 	roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S,
 		    (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) ? 1 : 0);
-	roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 0);
 
 	roce_set_field(context->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M,
 		       V2_QPC_BYTE_80_RX_CQN_S, to_hr_cq(ibqp->recv_cq)->cqn);
-	roce_set_field(qpc_mask->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M,
-		       V2_QPC_BYTE_80_RX_CQN_S, 0);
 	if (ibqp->srq) {
 		roce_set_field(context->byte_76_srqn_op_en,
 			       V2_QPC_BYTE_76_SRQN_M, V2_QPC_BYTE_76_SRQN_S,
 			       to_hr_srq(ibqp->srq)->srqn);
-		roce_set_field(qpc_mask->byte_76_srqn_op_en,
-			       V2_QPC_BYTE_76_SRQN_M, V2_QPC_BYTE_76_SRQN_S, 0);
 		roce_set_bit(context->byte_76_srqn_op_en,
 			     V2_QPC_BYTE_76_SRQ_EN_S, 1);
-		roce_set_bit(qpc_mask->byte_76_srqn_op_en,
-			     V2_QPC_BYTE_76_SRQ_EN_S, 0);
 	}
 
-	roce_set_field(qpc_mask->byte_84_rq_ci_pi,
-		       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
-		       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0);
-	roce_set_field(qpc_mask->byte_84_rq_ci_pi,
-		       V2_QPC_BYTE_84_RQ_CONSUMER_IDX_M,
-		       V2_QPC_BYTE_84_RQ_CONSUMER_IDX_S, 0);
-
-	roce_set_field(qpc_mask->byte_92_srq_info, V2_QPC_BYTE_92_SRQ_INFO_M,
-		       V2_QPC_BYTE_92_SRQ_INFO_S, 0);
-
-	roce_set_field(qpc_mask->byte_96_rx_reqmsn, V2_QPC_BYTE_96_RX_REQ_MSN_M,
-		       V2_QPC_BYTE_96_RX_REQ_MSN_S, 0);
-
-	roce_set_field(qpc_mask->byte_104_rq_sge,
-		       V2_QPC_BYTE_104_RQ_CUR_WQE_SGE_NUM_M,
-		       V2_QPC_BYTE_104_RQ_CUR_WQE_SGE_NUM_S, 0);
-
-	roce_set_bit(qpc_mask->byte_108_rx_reqepsn,
-		     V2_QPC_BYTE_108_RX_REQ_PSN_ERR_S, 0);
-	roce_set_field(qpc_mask->byte_108_rx_reqepsn,
-		       V2_QPC_BYTE_108_RX_REQ_LAST_OPTYPE_M,
-		       V2_QPC_BYTE_108_RX_REQ_LAST_OPTYPE_S, 0);
-	roce_set_bit(qpc_mask->byte_108_rx_reqepsn,
-		     V2_QPC_BYTE_108_RX_REQ_RNR_S, 0);
-
-	qpc_mask->rq_rnr_timer = 0;
-	qpc_mask->rx_msg_len = 0;
-	qpc_mask->rx_rkey_pkt_info = 0;
-	qpc_mask->rx_va = 0;
-
-	roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_HEAD_MAX_M,
-		       V2_QPC_BYTE_132_TRRL_HEAD_MAX_S, 0);
-	roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_TAIL_MAX_M,
-		       V2_QPC_BYTE_132_TRRL_TAIL_MAX_S, 0);
-
-	roce_set_bit(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RQ_RTY_WAIT_DO_S,
-		     0);
-	roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RAQ_TRRL_HEAD_M,
-		       V2_QPC_BYTE_140_RAQ_TRRL_HEAD_S, 0);
-	roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RAQ_TRRL_TAIL_M,
-		       V2_QPC_BYTE_140_RAQ_TRRL_TAIL_S, 0);
-
-	roce_set_field(qpc_mask->byte_144_raq,
-		       V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_M,
-		       V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_S, 0);
-	roce_set_field(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RAQ_CREDIT_M,
-		       V2_QPC_BYTE_144_RAQ_CREDIT_S, 0);
-	roce_set_bit(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RESP_RTY_FLG_S, 0);
-
-	roce_set_field(qpc_mask->byte_148_raq, V2_QPC_BYTE_148_RQ_MSN_M,
-		       V2_QPC_BYTE_148_RQ_MSN_S, 0);
-	roce_set_field(qpc_mask->byte_148_raq, V2_QPC_BYTE_148_RAQ_SYNDROME_M,
-		       V2_QPC_BYTE_148_RAQ_SYNDROME_S, 0);
-
-	roce_set_field(qpc_mask->byte_152_raq, V2_QPC_BYTE_152_RAQ_PSN_M,
-		       V2_QPC_BYTE_152_RAQ_PSN_S, 0);
-	roce_set_field(qpc_mask->byte_152_raq,
-		       V2_QPC_BYTE_152_RAQ_TRRL_RTY_HEAD_M,
-		       V2_QPC_BYTE_152_RAQ_TRRL_RTY_HEAD_S, 0);
-
-	roce_set_field(qpc_mask->byte_156_raq, V2_QPC_BYTE_156_RAQ_USE_PKTN_M,
-		       V2_QPC_BYTE_156_RAQ_USE_PKTN_S, 0);
-
-	roce_set_field(qpc_mask->byte_160_sq_ci_pi,
-		       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
-		       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S, 0);
-	roce_set_field(qpc_mask->byte_160_sq_ci_pi,
-		       V2_QPC_BYTE_160_SQ_CONSUMER_IDX_M,
-		       V2_QPC_BYTE_160_SQ_CONSUMER_IDX_S, 0);
-
-	roce_set_bit(qpc_mask->byte_168_irrl_idx,
-		     V2_QPC_BYTE_168_POLL_DB_WAIT_DO_S, 0);
-	roce_set_bit(qpc_mask->byte_168_irrl_idx,
-		     V2_QPC_BYTE_168_SCC_TOKEN_FORBID_SQ_DEQ_S, 0);
-	roce_set_bit(qpc_mask->byte_168_irrl_idx,
-		     V2_QPC_BYTE_168_WAIT_ACK_TIMEOUT_S, 0);
-	roce_set_bit(qpc_mask->byte_168_irrl_idx,
-		     V2_QPC_BYTE_168_MSG_RTY_LP_FLG_S, 0);
-	roce_set_bit(qpc_mask->byte_168_irrl_idx,
-		     V2_QPC_BYTE_168_SQ_INVLD_FLG_S, 0);
-	roce_set_field(qpc_mask->byte_168_irrl_idx,
-		       V2_QPC_BYTE_168_IRRL_IDX_LSB_M,
-		       V2_QPC_BYTE_168_IRRL_IDX_LSB_S, 0);
-
-	roce_set_field(context->byte_172_sq_psn, V2_QPC_BYTE_172_ACK_REQ_FREQ_M,
-		       V2_QPC_BYTE_172_ACK_REQ_FREQ_S, 4);
-	roce_set_field(qpc_mask->byte_172_sq_psn,
-		       V2_QPC_BYTE_172_ACK_REQ_FREQ_M,
-		       V2_QPC_BYTE_172_ACK_REQ_FREQ_S, 0);
-
-	roce_set_bit(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_MSG_RNR_FLG_S,
-		     0);
-
 	roce_set_bit(context->byte_172_sq_psn, V2_QPC_BYTE_172_FRE_S, 1);
-	roce_set_bit(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_FRE_S, 0);
-
-	roce_set_field(qpc_mask->byte_176_msg_pktn,
-		       V2_QPC_BYTE_176_MSG_USE_PKTN_M,
-		       V2_QPC_BYTE_176_MSG_USE_PKTN_S, 0);
-	roce_set_field(qpc_mask->byte_176_msg_pktn,
-		       V2_QPC_BYTE_176_IRRL_HEAD_PRE_M,
-		       V2_QPC_BYTE_176_IRRL_HEAD_PRE_S, 0);
-
-	roce_set_field(qpc_mask->byte_184_irrl_idx,
-		       V2_QPC_BYTE_184_IRRL_IDX_MSB_M,
-		       V2_QPC_BYTE_184_IRRL_IDX_MSB_S, 0);
-
-	qpc_mask->cur_sge_offset = 0;
-
-	roce_set_field(qpc_mask->byte_192_ext_sge,
-		       V2_QPC_BYTE_192_CUR_SGE_IDX_M,
-		       V2_QPC_BYTE_192_CUR_SGE_IDX_S, 0);
-	roce_set_field(qpc_mask->byte_192_ext_sge,
-		       V2_QPC_BYTE_192_EXT_SGE_NUM_LEFT_M,
-		       V2_QPC_BYTE_192_EXT_SGE_NUM_LEFT_S, 0);
-
-	roce_set_field(qpc_mask->byte_196_sq_psn, V2_QPC_BYTE_196_IRRL_HEAD_M,
-		       V2_QPC_BYTE_196_IRRL_HEAD_S, 0);
-
-	roce_set_field(qpc_mask->byte_200_sq_max, V2_QPC_BYTE_200_SQ_MAX_IDX_M,
-		       V2_QPC_BYTE_200_SQ_MAX_IDX_S, 0);
-	roce_set_field(qpc_mask->byte_200_sq_max,
-		       V2_QPC_BYTE_200_LCL_OPERATED_CNT_M,
-		       V2_QPC_BYTE_200_LCL_OPERATED_CNT_S, 0);
-
-	roce_set_bit(qpc_mask->byte_208_irrl, V2_QPC_BYTE_208_PKT_RNR_FLG_S, 0);
-	roce_set_bit(qpc_mask->byte_208_irrl, V2_QPC_BYTE_208_PKT_RTY_FLG_S, 0);
-
-	roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_CHECK_FLG_M,
-		       V2_QPC_BYTE_212_CHECK_FLG_S, 0);
-
-	qpc_mask->sq_timer = 0;
-
-	roce_set_field(qpc_mask->byte_220_retry_psn_msn,
-		       V2_QPC_BYTE_220_RETRY_MSG_MSN_M,
-		       V2_QPC_BYTE_220_RETRY_MSG_MSN_S, 0);
-	roce_set_field(qpc_mask->byte_232_irrl_sge,
-		       V2_QPC_BYTE_232_IRRL_SGE_IDX_M,
-		       V2_QPC_BYTE_232_IRRL_SGE_IDX_S, 0);
-
-	roce_set_bit(qpc_mask->byte_232_irrl_sge, V2_QPC_BYTE_232_SO_LP_VLD_S,
-		     0);
-	roce_set_bit(qpc_mask->byte_232_irrl_sge,
-		     V2_QPC_BYTE_232_FENCE_LP_VLD_S, 0);
-	roce_set_bit(qpc_mask->byte_232_irrl_sge, V2_QPC_BYTE_232_IRRL_LP_VLD_S,
-		     0);
-
-	qpc_mask->irrl_cur_sge_offset = 0;
-
-	roce_set_field(qpc_mask->byte_240_irrl_tail,
-		       V2_QPC_BYTE_240_IRRL_TAIL_REAL_M,
-		       V2_QPC_BYTE_240_IRRL_TAIL_REAL_S, 0);
-	roce_set_field(qpc_mask->byte_240_irrl_tail,
-		       V2_QPC_BYTE_240_IRRL_TAIL_RD_M,
-		       V2_QPC_BYTE_240_IRRL_TAIL_RD_S, 0);
-	roce_set_field(qpc_mask->byte_240_irrl_tail,
-		       V2_QPC_BYTE_240_RX_ACK_MSN_M,
-		       V2_QPC_BYTE_240_RX_ACK_MSN_S, 0);
-
-	roce_set_field(qpc_mask->byte_248_ack_psn, V2_QPC_BYTE_248_IRRL_PSN_M,
-		       V2_QPC_BYTE_248_IRRL_PSN_S, 0);
-	roce_set_bit(qpc_mask->byte_248_ack_psn, V2_QPC_BYTE_248_ACK_PSN_ERR_S,
-		     0);
-	roce_set_field(qpc_mask->byte_248_ack_psn,
-		       V2_QPC_BYTE_248_ACK_LAST_OPTYPE_M,
-		       V2_QPC_BYTE_248_ACK_LAST_OPTYPE_S, 0);
-	roce_set_bit(qpc_mask->byte_248_ack_psn, V2_QPC_BYTE_248_IRRL_PSN_VLD_S,
-		     0);
-	roce_set_bit(qpc_mask->byte_248_ack_psn,
-		     V2_QPC_BYTE_248_RNR_RETRY_FLAG_S, 0);
-	roce_set_bit(qpc_mask->byte_248_ack_psn, V2_QPC_BYTE_248_CQ_ERR_IND_S,
-		     0);
 
 	hr_qp->access_flags = attr->qp_access_flags;
 	roce_set_field(context->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M,
 		       V2_QPC_BYTE_252_TX_CQN_S, to_hr_cq(ibqp->send_cq)->cqn);
-	roce_set_field(qpc_mask->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M,
-		       V2_QPC_BYTE_252_TX_CQN_S, 0);
-
-	roce_set_field(qpc_mask->byte_252_err_txcqn, V2_QPC_BYTE_252_ERR_TYPE_M,
-		       V2_QPC_BYTE_252_ERR_TYPE_S, 0);
-
-	roce_set_field(qpc_mask->byte_256_sqflush_rqcqe,
-		       V2_QPC_BYTE_256_RQ_CQE_IDX_M,
-		       V2_QPC_BYTE_256_RQ_CQE_IDX_S, 0);
-	roce_set_field(qpc_mask->byte_256_sqflush_rqcqe,
-		       V2_QPC_BYTE_256_SQ_FLUSH_IDX_M,
-		       V2_QPC_BYTE_256_SQ_FLUSH_IDX_S, 0);
 }
 
 static void modify_qp_init_to_init(struct ib_qp *ibqp,
@@ -3573,6 +3891,12 @@
 			     IB_ACCESS_REMOTE_ATOMIC));
 		roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S,
 			     0);
+		roce_set_bit(context->byte_76_srqn_op_en,
+			     V2_QPC_BYTE_76_EXT_ATE_S,
+			     !!(attr->qp_access_flags &
+				IB_ACCESS_REMOTE_ATOMIC));
+		roce_set_bit(qpc_mask->byte_76_srqn_op_en,
+			     V2_QPC_BYTE_76_EXT_ATE_S, 0);
 	} else {
 		roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S,
 			     !!(hr_qp->access_flags & IB_ACCESS_REMOTE_READ));
@@ -3588,6 +3912,11 @@
 			     !!(hr_qp->access_flags & IB_ACCESS_REMOTE_ATOMIC));
 		roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S,
 			     0);
+		roce_set_bit(context->byte_76_srqn_op_en,
+			     V2_QPC_BYTE_76_EXT_ATE_S,
+			     !!(hr_qp->access_flags & IB_ACCESS_REMOTE_ATOMIC));
+		roce_set_bit(qpc_mask->byte_76_srqn_op_en,
+			     V2_QPC_BYTE_76_EXT_ATE_S, 0);
 	}
 
 	roce_set_field(context->byte_16_buf_ba_pg_sz, V2_QPC_BYTE_16_PD_M,
@@ -3630,84 +3959,24 @@
 	}
 }
 
-static bool check_wqe_rq_mtt_count(struct hns_roce_dev *hr_dev,
-				   struct hns_roce_qp *hr_qp, int mtt_cnt,
-				   u32 page_size)
+static int config_qp_rq_buf(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_qp *hr_qp,
+			    struct hns_roce_v2_qp_context *context,
+			    struct hns_roce_v2_qp_context *qpc_mask)
 {
-	struct device *dev = hr_dev->dev;
-
-	if (hr_qp->rq.wqe_cnt < 1)
-		return true;
-
-	if (mtt_cnt < 1) {
-		dev_err(dev, "qp(0x%lx) rqwqe buf ba find failed\n",
-			hr_qp->qpn);
-		return false;
-	}
-
-	if (mtt_cnt < MTT_MIN_COUNT &&
-		(hr_qp->rq.offset + page_size) < hr_qp->buff_size) {
-		dev_err(dev, "qp(0x%lx) next rqwqe buf ba find failed\n",
-			hr_qp->qpn);
-		return false;
-	}
-
-	return true;
-}
-
-static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
-				 const struct ib_qp_attr *attr, int attr_mask,
-				 struct hns_roce_v2_qp_context *context,
-				 struct hns_roce_v2_qp_context *qpc_mask)
-{
-	const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr);
-	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
-	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
-	struct device *dev = hr_dev->dev;
 	u64 mtts[MTT_MIN_COUNT] = { 0 };
-	dma_addr_t dma_handle_3;
-	dma_addr_t dma_handle_2;
 	u64 wqe_sge_ba;
-	u32 page_size;
-	u8 port_num;
-	u64 *mtts_3;
-	u64 *mtts_2;
 	int count;
-	u8 *dmac;
-	u8 *smac;
-	int port;
 
 	/* Search qp buf's mtts */
-	page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
-	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr,
-				  hr_qp->rq.offset / page_size, mtts,
+	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, hr_qp->rq.offset, mtts,
 				  MTT_MIN_COUNT, &wqe_sge_ba);
-	if (!ibqp->srq)
-		if (!check_wqe_rq_mtt_count(hr_dev, hr_qp, count, page_size))
-			return -EINVAL;
-
-	/* Search IRRL's mtts */
-	mtts_2 = hns_roce_table_find(hr_dev, &hr_dev->qp_table.irrl_table,
-				     hr_qp->qpn, &dma_handle_2);
-	if (!mtts_2) {
-		dev_err(dev, "qp irrl_table find failed\n");
+	if (hr_qp->rq.wqe_cnt && count < 1) {
+		ibdev_err(&hr_dev->ib_dev,
+			  "failed to find RQ WQE, QPN = 0x%lx.\n", hr_qp->qpn);
 		return -EINVAL;
 	}
 
-	/* Search TRRL's mtts */
-	mtts_3 = hns_roce_table_find(hr_dev, &hr_dev->qp_table.trrl_table,
-				     hr_qp->qpn, &dma_handle_3);
-	if (!mtts_3) {
-		dev_err(dev, "qp trrl_table find failed\n");
-		return -EINVAL;
-	}
-
-	if (attr_mask & IB_QP_ALT_PATH) {
-		dev_err(dev, "INIT2RTR attr_mask (0x%x) error\n", attr_mask);
-		return -EINVAL;
-	}
-
-	dmac = (u8 *)attr->ah_attr.roce.dmac;
 	context->wqe_sge_ba = cpu_to_le32(wqe_sge_ba >> 3);
 	qpc_mask->wqe_sge_ba = 0;
 
@@ -3724,17 +3993,16 @@
 
 	roce_set_field(context->byte_12_sq_hop, V2_QPC_BYTE_12_SQ_HOP_NUM_M,
 		       V2_QPC_BYTE_12_SQ_HOP_NUM_S,
-		       hr_dev->caps.wqe_sq_hop_num == HNS_ROCE_HOP_NUM_0 ?
-		       0 : hr_dev->caps.wqe_sq_hop_num);
+		       to_hr_hem_hopnum(hr_dev->caps.wqe_sq_hop_num,
+					hr_qp->sq.wqe_cnt));
 	roce_set_field(qpc_mask->byte_12_sq_hop, V2_QPC_BYTE_12_SQ_HOP_NUM_M,
 		       V2_QPC_BYTE_12_SQ_HOP_NUM_S, 0);
 
 	roce_set_field(context->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_SGE_HOP_NUM_M,
 		       V2_QPC_BYTE_20_SGE_HOP_NUM_S,
-		       ((ibqp->qp_type == IB_QPT_GSI) ||
-		       hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ?
-		       hr_dev->caps.wqe_sge_hop_num : 0);
+		       to_hr_hem_hopnum(hr_dev->caps.wqe_sge_hop_num,
+					hr_qp->sge.sge_cnt));
 	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_SGE_HOP_NUM_M,
 		       V2_QPC_BYTE_20_SGE_HOP_NUM_S, 0);
@@ -3742,8 +4010,9 @@
 	roce_set_field(context->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_RQ_HOP_NUM_M,
 		       V2_QPC_BYTE_20_RQ_HOP_NUM_S,
-		       hr_dev->caps.wqe_rq_hop_num == HNS_ROCE_HOP_NUM_0 ?
-		       0 : hr_dev->caps.wqe_rq_hop_num);
+		       to_hr_hem_hopnum(hr_dev->caps.wqe_rq_hop_num,
+					hr_qp->rq.wqe_cnt));
+
 	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
 		       V2_QPC_BYTE_20_RQ_HOP_NUM_M,
 		       V2_QPC_BYTE_20_RQ_HOP_NUM_S, 0);
@@ -3751,7 +4020,7 @@
 	roce_set_field(context->byte_16_buf_ba_pg_sz,
 		       V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M,
 		       V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S,
-		       hr_qp->wqe_bt_pg_shift + PG_SHIFT_OFFSET);
+		       to_hr_hw_page_shift(hr_qp->mtr.hem_cfg.ba_pg_shift));
 	roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz,
 		       V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_M,
 		       V2_QPC_BYTE_16_WQE_SGE_BA_PG_SZ_S, 0);
@@ -3759,50 +4028,190 @@
 	roce_set_field(context->byte_16_buf_ba_pg_sz,
 		       V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M,
 		       V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S,
-		       hr_dev->caps.mtt_buf_pg_sz + PG_SHIFT_OFFSET);
+		       to_hr_hw_page_shift(hr_qp->mtr.hem_cfg.buf_pg_shift));
 	roce_set_field(qpc_mask->byte_16_buf_ba_pg_sz,
 		       V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M,
 		       V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S, 0);
 
-	context->rq_cur_blk_addr = cpu_to_le32(mtts[0] >> PAGE_ADDR_SHIFT);
+	context->rq_cur_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[0]));
 	qpc_mask->rq_cur_blk_addr = 0;
 
 	roce_set_field(context->byte_92_srq_info,
 		       V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_M,
 		       V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_S,
-		       mtts[0] >> (32 + PAGE_ADDR_SHIFT));
+		       upper_32_bits(to_hr_hw_page_addr(mtts[0])));
 	roce_set_field(qpc_mask->byte_92_srq_info,
 		       V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_M,
 		       V2_QPC_BYTE_92_RQ_CUR_BLK_ADDR_S, 0);
 
-	context->rq_nxt_blk_addr = cpu_to_le32(mtts[1] >> PAGE_ADDR_SHIFT);
+	context->rq_nxt_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[1]));
 	qpc_mask->rq_nxt_blk_addr = 0;
 
 	roce_set_field(context->byte_104_rq_sge,
 		       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M,
 		       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S,
-		       mtts[1] >> (32 + PAGE_ADDR_SHIFT));
+		       upper_32_bits(to_hr_hw_page_addr(mtts[1])));
 	roce_set_field(qpc_mask->byte_104_rq_sge,
 		       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M,
 		       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S, 0);
 
+	roce_set_field(context->byte_84_rq_ci_pi,
+		       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
+		       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, hr_qp->rq.head);
+	roce_set_field(qpc_mask->byte_84_rq_ci_pi,
+		       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
+		       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0);
+
+	roce_set_field(qpc_mask->byte_84_rq_ci_pi,
+		       V2_QPC_BYTE_84_RQ_CONSUMER_IDX_M,
+		       V2_QPC_BYTE_84_RQ_CONSUMER_IDX_S, 0);
+
+	return 0;
+}
+
+static int config_qp_sq_buf(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_qp *hr_qp,
+			    struct hns_roce_v2_qp_context *context,
+			    struct hns_roce_v2_qp_context *qpc_mask)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	u64 sge_cur_blk = 0;
+	u64 sq_cur_blk = 0;
+	int count;
+
+	/* search qp buf's mtts */
+	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, &sq_cur_blk, 1, NULL);
+	if (count < 1) {
+		ibdev_err(ibdev, "failed to find QP(0x%lx) SQ buf.\n",
+			  hr_qp->qpn);
+		return -EINVAL;
+	}
+	if (hr_qp->sge.sge_cnt > 0) {
+		count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr,
+					  hr_qp->sge.offset,
+					  &sge_cur_blk, 1, NULL);
+		if (count < 1) {
+			ibdev_err(ibdev, "failed to find QP(0x%lx) SGE buf.\n",
+				  hr_qp->qpn);
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * In the v2 engine, software passes both the context and the context
+	 * mask to the hardware when modifying a QP. For every field that is
+	 * modified in the context, all bits of that field in the context mask
+	 * must be cleared to 0 at the same time; otherwise they stay 0x1.
+	 */
+	context->sq_cur_blk_addr = cpu_to_le32(to_hr_hw_page_addr(sq_cur_blk));
+	roce_set_field(context->byte_168_irrl_idx,
+		       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M,
+		       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S,
+		       upper_32_bits(to_hr_hw_page_addr(sq_cur_blk)));
+	qpc_mask->sq_cur_blk_addr = 0;
+	roce_set_field(qpc_mask->byte_168_irrl_idx,
+		       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M,
+		       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, 0);
+
+	context->sq_cur_sge_blk_addr =
+		cpu_to_le32(to_hr_hw_page_addr(sge_cur_blk));
+	roce_set_field(context->byte_184_irrl_idx,
+		       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M,
+		       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S,
+		       upper_32_bits(to_hr_hw_page_addr(sge_cur_blk)));
+	qpc_mask->sq_cur_sge_blk_addr = 0;
+	roce_set_field(qpc_mask->byte_184_irrl_idx,
+		       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M,
+		       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, 0);
+
+	context->rx_sq_cur_blk_addr =
+		cpu_to_le32(to_hr_hw_page_addr(sq_cur_blk));
+	roce_set_field(context->byte_232_irrl_sge,
+		       V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M,
+		       V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S,
+		       upper_32_bits(to_hr_hw_page_addr(sq_cur_blk)));
+	qpc_mask->rx_sq_cur_blk_addr = 0;
+	roce_set_field(qpc_mask->byte_232_irrl_sge,
+		       V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M,
+		       V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S, 0);
+
+	return 0;
+}
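As the comment in config_qp_sq_buf() describes, every field written into the context must have the matching bits cleared in the mask for the hardware to apply it. A simplified, self-contained illustration of that convention; roce_set_field() here is a stripped-down re-implementation for the example, not the driver's macro:

#include <stdint.h>
#include <stdio.h>

/* Clear the masked bits, then OR in the shifted value. */
static void roce_set_field(uint32_t *reg, uint32_t mask, int shift, uint32_t val)
{
	*reg = (*reg & ~mask) | ((val << shift) & mask);
}

#define SQ_CUR_BLK_M	0x000fffffu	/* made-up field mask for the example */
#define SQ_CUR_BLK_S	0

int main(void)
{
	uint32_t ctx = 0;		/* context starts all zeroes */
	uint32_t ctx_mask = ~0u;	/* mask starts all ones ("leave untouched") */

	/* Program one field: set it in the context, clear it in the mask. */
	roce_set_field(&ctx, SQ_CUR_BLK_M, SQ_CUR_BLK_S, 0x12345);
	roce_set_field(&ctx_mask, SQ_CUR_BLK_M, SQ_CUR_BLK_S, 0);

	printf("ctx=0x%08x ctx_mask=0x%08x\n", ctx, ctx_mask);
	return 0;
}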
+
+static inline enum ib_mtu get_mtu(struct ib_qp *ibqp,
+				  const struct ib_qp_attr *attr)
+{
+	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_UD)
+		return IB_MTU_4096;
+
+	return attr->path_mtu;
+}
+
+static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
+				 const struct ib_qp_attr *attr, int attr_mask,
+				 struct hns_roce_v2_qp_context *context,
+				 struct hns_roce_v2_qp_context *qpc_mask)
+{
+	const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr);
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
+	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	dma_addr_t trrl_ba;
+	dma_addr_t irrl_ba;
+	enum ib_mtu mtu;
+	u8 lp_pktn_ini;
+	u8 port_num;
+	u64 *mtts;
+	u8 *dmac;
+	u8 *smac;
+	int port;
+	int ret;
+
+	ret = config_qp_rq_buf(hr_dev, hr_qp, context, qpc_mask);
+	if (ret) {
+		ibdev_err(ibdev, "failed to config rq buf, ret = %d.\n", ret);
+		return ret;
+	}
+
+	/* Search IRRL's mtts */
+	mtts = hns_roce_table_find(hr_dev, &hr_dev->qp_table.irrl_table,
+				   hr_qp->qpn, &irrl_ba);
+	if (!mtts) {
+		ibdev_err(ibdev, "failed to find qp irrl_table.\n");
+		return -EINVAL;
+	}
+
+	/* Search TRRL's mtts */
+	mtts = hns_roce_table_find(hr_dev, &hr_dev->qp_table.trrl_table,
+				   hr_qp->qpn, &trrl_ba);
+	if (!mtts) {
+		ibdev_err(ibdev, "failed to find qp trrl_table.\n");
+		return -EINVAL;
+	}
+
+	if (attr_mask & IB_QP_ALT_PATH) {
+		ibdev_err(ibdev, "INIT2RTR attr_mask (0x%x) error.\n",
+			  attr_mask);
+		return -EINVAL;
+	}
+
 	roce_set_field(context->byte_132_trrl, V2_QPC_BYTE_132_TRRL_BA_M,
-		       V2_QPC_BYTE_132_TRRL_BA_S, dma_handle_3 >> 4);
+		       V2_QPC_BYTE_132_TRRL_BA_S, trrl_ba >> 4);
 	roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_BA_M,
 		       V2_QPC_BYTE_132_TRRL_BA_S, 0);
-	context->trrl_ba = cpu_to_le32(dma_handle_3 >> (16 + 4));
+	context->trrl_ba = cpu_to_le32(trrl_ba >> (16 + 4));
 	qpc_mask->trrl_ba = 0;
 	roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_TRRL_BA_M,
 		       V2_QPC_BYTE_140_TRRL_BA_S,
-		       (u32)(dma_handle_3 >> (32 + 16 + 4)));
+		       (u32)(trrl_ba >> (32 + 16 + 4)));
 	roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_TRRL_BA_M,
 		       V2_QPC_BYTE_140_TRRL_BA_S, 0);
 
-	context->irrl_ba = cpu_to_le32(dma_handle_2 >> 6);
+	context->irrl_ba = cpu_to_le32(irrl_ba >> 6);
 	qpc_mask->irrl_ba = 0;
 	roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_IRRL_BA_M,
 		       V2_QPC_BYTE_208_IRRL_BA_S,
-		       dma_handle_2 >> (32 + 6));
+		       irrl_ba >> (32 + 6));
 	roce_set_field(qpc_mask->byte_208_irrl, V2_QPC_BYTE_208_IRRL_BA_M,
 		       V2_QPC_BYTE_208_IRRL_BA_S, 0);
 
@@ -3817,6 +4226,7 @@
 	port = (attr_mask & IB_QP_PORT) ? (attr->port_num - 1) : hr_qp->port;
 
 	smac = (u8 *)hr_dev->dev_addr[port];
+	dmac = (u8 *)attr->ah_attr.roce.dmac;
 	/* when dmac equals smac or loop_idc is 1, it should loopback */
 	if (ether_addr_equal_unaligned(dmac, smac) ||
 	    hr_dev->loop_idc == 0x1) {
@@ -3834,13 +4244,12 @@
 	/* Configure GID index */
 	port_num = rdma_ah_get_port_num(&attr->ah_attr);
 	roce_set_field(context->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_SGID_IDX_M,
-		       V2_QPC_BYTE_20_SGID_IDX_S,
+		       V2_QPC_BYTE_20_SGID_IDX_M, V2_QPC_BYTE_20_SGID_IDX_S,
 		       hns_get_gid_index(hr_dev, port_num - 1,
 					 grh->sgid_index));
 	roce_set_field(qpc_mask->byte_20_smac_sgid_idx,
-		       V2_QPC_BYTE_20_SGID_IDX_M,
-		       V2_QPC_BYTE_20_SGID_IDX_S, 0);
+		       V2_QPC_BYTE_20_SGID_IDX_M, V2_QPC_BYTE_20_SGID_IDX_S, 0);
+
 	memcpy(&(context->dmac), dmac, sizeof(u32));
 	roce_set_field(context->byte_52_udpspn_dmac, V2_QPC_BYTE_52_DMAC_M,
 		       V2_QPC_BYTE_52_DMAC_S, *((u16 *)(&dmac[4])));
@@ -3848,32 +4257,32 @@
 	roce_set_field(qpc_mask->byte_52_udpspn_dmac, V2_QPC_BYTE_52_DMAC_M,
 		       V2_QPC_BYTE_52_DMAC_S, 0);
 
-	/* mtu*(2^LP_PKTN_INI) should not bigger than 1 message length 64kb */
+	mtu = get_mtu(ibqp, attr);
+	hr_qp->path_mtu = mtu;
+
+	if (attr_mask & IB_QP_PATH_MTU) {
+		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M,
+			       V2_QPC_BYTE_24_MTU_S, mtu);
+		roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M,
+			       V2_QPC_BYTE_24_MTU_S, 0);
+	}
+
+#define MAX_LP_MSG_LEN 65536
+	/* MTU * (2 ^ LP_PKTN_INI) shouldn't be bigger than 64KB */
+	lp_pktn_ini = ilog2(MAX_LP_MSG_LEN / ib_mtu_enum_to_int(mtu));
+
 	roce_set_field(context->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M,
-		       V2_QPC_BYTE_56_LP_PKTN_INI_S, 4);
+		       V2_QPC_BYTE_56_LP_PKTN_INI_S, lp_pktn_ini);
 	roce_set_field(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_LP_PKTN_INI_M,
 		       V2_QPC_BYTE_56_LP_PKTN_INI_S, 0);
 
-	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_UD)
-		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M,
-			       V2_QPC_BYTE_24_MTU_S, IB_MTU_4096);
-	else if (attr_mask & IB_QP_PATH_MTU)
-		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M,
-			       V2_QPC_BYTE_24_MTU_S, attr->path_mtu);
+	/* ACK_REQ_FREQ should be larger than or equal to LP_PKTN_INI */
+	roce_set_field(context->byte_172_sq_psn, V2_QPC_BYTE_172_ACK_REQ_FREQ_M,
+		       V2_QPC_BYTE_172_ACK_REQ_FREQ_S, lp_pktn_ini);
+	roce_set_field(qpc_mask->byte_172_sq_psn,
+		       V2_QPC_BYTE_172_ACK_REQ_FREQ_M,
+		       V2_QPC_BYTE_172_ACK_REQ_FREQ_S, 0);
 
-	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M,
-		       V2_QPC_BYTE_24_MTU_S, 0);
-
-	roce_set_field(context->byte_84_rq_ci_pi,
-		       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
-		       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, hr_qp->rq.head);
-	roce_set_field(qpc_mask->byte_84_rq_ci_pi,
-		       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
-		       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0);
-
-	roce_set_field(qpc_mask->byte_84_rq_ci_pi,
-		       V2_QPC_BYTE_84_RQ_CONSUMER_IDX_M,
-		       V2_QPC_BYTE_84_RQ_CONSUMER_IDX_S, 0);
 	roce_set_bit(qpc_mask->byte_108_rx_reqepsn,
 		     V2_QPC_BYTE_108_RX_REQ_PSN_ERR_S, 0);
 	roce_set_field(qpc_mask->byte_96_rx_reqmsn, V2_QPC_BYTE_96_RX_REQ_MSN_M,
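The LP_PKTN_INI change above replaces the fixed value 4 with ilog2(65536 / MTU), keeping MTU * 2^LP_PKTN_INI at 64 KB, and programs ACK_REQ_FREQ to the same value so an ACK is requested at least once per such message. A small worked example of the arithmetic, assuming the MTU enum has already been converted to a byte count as ib_mtu_enum_to_int() does:

#include <stdio.h>

#define MAX_LP_MSG_LEN 65536

/* log2 for powers of two; a stand-in for the kernel's ilog2(). */
static unsigned int ilog2_u32(unsigned int v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	const int mtus[] = { 256, 512, 1024, 2048, 4096 };

	for (unsigned int i = 0; i < sizeof(mtus) / sizeof(mtus[0]); i++) {
		unsigned int lp_pktn_ini = ilog2_u32(MAX_LP_MSG_LEN / mtus[i]);

		/* e.g. MTU 4096: 65536 / 4096 = 16, so LP_PKTN_INI = 4 */
		printf("mtu %4d -> lp_pktn_ini %u\n", mtus[i], lp_pktn_ini);
	}
	return 0;
}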
@@ -3908,80 +4317,20 @@
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
-	struct device *dev = hr_dev->dev;
-	u64 sge_cur_blk = 0;
-	u64 sq_cur_blk = 0;
-	u32 page_size;
-	int count;
-
-	/* Search qp buf's mtts */
-	count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr, 0, &sq_cur_blk, 1, NULL);
-	if (count < 1) {
-		dev_err(dev, "qp(0x%lx) buf pa find failed\n", hr_qp->qpn);
-		return -EINVAL;
-	}
-
-	if (hr_qp->sge.offset) {
-		page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
-		count = hns_roce_mtr_find(hr_dev, &hr_qp->mtr,
-					  hr_qp->sge.offset / page_size,
-					  &sge_cur_blk, 1, NULL);
-		if (count < 1) {
-			dev_err(dev, "qp(0x%lx) sge pa find failed\n",
-				hr_qp->qpn);
-			return -EINVAL;
-		}
-	}
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	int ret;
 
 	/* Not support alternate path and path migration */
-	if ((attr_mask & IB_QP_ALT_PATH) ||
-	    (attr_mask & IB_QP_PATH_MIG_STATE)) {
-		dev_err(dev, "RTR2RTS attr_mask (0x%x)error\n", attr_mask);
+	if (attr_mask & (IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE)) {
+		ibdev_err(ibdev, "RTR2RTS attr_mask (0x%x) error\n", attr_mask);
 		return -EINVAL;
 	}
 
-	/*
-	 * In v2 engine, software pass context and context mask to hardware
-	 * when modifying qp. If software need modify some fields in context,
-	 * we should set all bits of the relevant fields in context mask to
-	 * 0 at the same time, else set them to 0x1.
-	 */
-	context->sq_cur_blk_addr = cpu_to_le32(sq_cur_blk >> PAGE_ADDR_SHIFT);
-	roce_set_field(context->byte_168_irrl_idx,
-		       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M,
-		       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S,
-		       sq_cur_blk >> (32 + PAGE_ADDR_SHIFT));
-	qpc_mask->sq_cur_blk_addr = 0;
-	roce_set_field(qpc_mask->byte_168_irrl_idx,
-		       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M,
-		       V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, 0);
-
-	context->sq_cur_sge_blk_addr = ((ibqp->qp_type == IB_QPT_GSI) ||
-		       hr_qp->sq.max_gs > HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ?
-		       cpu_to_le32(sge_cur_blk >>
-		       PAGE_ADDR_SHIFT) : 0;
-	roce_set_field(context->byte_184_irrl_idx,
-		       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M,
-		       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S,
-		       ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs >
-		       HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE) ?
-		       (sge_cur_blk >>
-		       (32 + PAGE_ADDR_SHIFT)) : 0);
-	qpc_mask->sq_cur_sge_blk_addr = 0;
-	roce_set_field(qpc_mask->byte_184_irrl_idx,
-		       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M,
-		       V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, 0);
-
-	context->rx_sq_cur_blk_addr =
-		cpu_to_le32(sq_cur_blk >> PAGE_ADDR_SHIFT);
-	roce_set_field(context->byte_232_irrl_sge,
-		       V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M,
-		       V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S,
-		       sq_cur_blk >> (32 + PAGE_ADDR_SHIFT));
-	qpc_mask->rx_sq_cur_blk_addr = 0;
-	roce_set_field(qpc_mask->byte_232_irrl_sge,
-		       V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_M,
-		       V2_QPC_BYTE_232_RX_SQ_CUR_BLK_ADDR_S, 0);
+	ret = config_qp_sq_buf(hr_dev, hr_qp, context, qpc_mask);
+	if (ret) {
+		ibdev_err(ibdev, "failed to config sq buf, ret = %d.\n", ret);
+		return ret;
+	}
 
 	/*
 	 * Set some fields in context to zero, Because the default values
@@ -4030,19 +4379,12 @@
 	return 0;
 }
 
-static inline bool hns_roce_v2_check_qp_stat(enum ib_qp_state cur_state,
-					     enum ib_qp_state new_state)
+static inline u16 get_udp_sport(u32 fl, u32 lqpn, u32 rqpn)
 {
+	if (!fl)
+		fl = rdma_calc_flow_label(lqpn, rqpn);
 
-	if ((cur_state != IB_QPS_RESET &&
-	    (new_state == IB_QPS_ERR || new_state == IB_QPS_RESET)) ||
-	    ((cur_state == IB_QPS_RTS || cur_state == IB_QPS_SQD) &&
-	    (new_state == IB_QPS_RTS || new_state == IB_QPS_SQD)) ||
-	    (cur_state == IB_QPS_SQE && new_state == IB_QPS_RTS))
-		return true;
-
-	return false;
-
+	return rdma_flow_label_to_udp_sport(fl);
 }
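get_udp_sport() above derives the RoCEv2 UDP source port from the flow label, falling back to a label computed from the local and remote QPNs, instead of the previous hard-coded 0x12b7. The sketch below only illustrates the idea; the hash and port-range constants are assumptions, not the RDMA core's rdma_calc_flow_label()/rdma_flow_label_to_udp_sport() implementations:

#include <stdint.h>
#include <stdio.h>

/* Assumed: mix the two QPNs into a 20-bit pseudo flow label. */
static uint32_t calc_flow_label(uint32_t lqpn, uint32_t rqpn)
{
	uint32_t v = (lqpn ^ (rqpn << 7)) * 2654435761u;	/* illustrative multiplicative hash */

	return v & 0xfffff;	/* IPv6 flow label is 20 bits */
}

/* Assumed: fold the label into a high (dynamic/private) UDP port range. */
static uint16_t flow_label_to_udp_sport(uint32_t fl)
{
	return (uint16_t)(0xc000 | (fl & 0x3fff));
}

static uint16_t get_udp_sport(uint32_t fl, uint32_t lqpn, uint32_t rqpn)
{
	if (!fl)
		fl = calc_flow_label(lqpn, rqpn);

	return flow_label_to_udp_sport(fl);
}

int main(void)
{
	/* Different QP pairs now spread across source ports (better ECMP entropy). */
	printf("sport(qp 0x12, 0x34) = 0x%04x\n", get_udp_sport(0, 0x12, 0x34));
	printf("sport(qp 0x56, 0x78) = 0x%04x\n", get_udp_sport(0, 0x56, 0x78));
	return 0;
}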
 
 static int hns_roce_v2_set_path(struct ib_qp *ibqp,
@@ -4054,10 +4396,11 @@
 	const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr);
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	const struct ib_gid_attr *gid_attr = NULL;
 	int is_roce_protocol;
+	u16 vlan_id = 0xffff;
 	bool is_udp = false;
-	u16 vlan = 0xffff;
 	u8 ib_port;
 	u8 hr_port;
 	int ret;
@@ -4069,7 +4412,7 @@
 
 	if (is_roce_protocol) {
 		gid_attr = attr->ah_attr.grh.sgid_attr;
-		ret = rdma_read_gid_l2_fields(gid_attr, &vlan, NULL);
+		ret = rdma_read_gid_l2_fields(gid_attr, &vlan_id, NULL);
 		if (ret)
 			return ret;
 
@@ -4078,7 +4421,7 @@
 				 IB_GID_TYPE_ROCE_UDP_ENCAP);
 	}
 
-	if (vlan < VLAN_CFI_MASK) {
+	if (vlan_id < VLAN_N_VID) {
 		roce_set_bit(context->byte_76_srqn_op_en,
 			     V2_QPC_BYTE_76_RQ_VLAN_EN_S, 1);
 		roce_set_bit(qpc_mask->byte_76_srqn_op_en,
@@ -4090,24 +4433,25 @@
 	}
 
 	roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M,
-		       V2_QPC_BYTE_24_VLAN_ID_S, vlan);
+		       V2_QPC_BYTE_24_VLAN_ID_S, vlan_id);
 	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M,
 		       V2_QPC_BYTE_24_VLAN_ID_S, 0);
 
 	if (grh->sgid_index >= hr_dev->caps.gid_table_len[hr_port]) {
-		dev_err(hr_dev->dev, "sgid_index(%u) too large. max is %d\n",
-			grh->sgid_index, hr_dev->caps.gid_table_len[hr_port]);
+		ibdev_err(ibdev, "sgid_index(%u) too large. max is %d\n",
+			  grh->sgid_index, hr_dev->caps.gid_table_len[hr_port]);
 		return -EINVAL;
 	}
 
 	if (attr->ah_attr.type != RDMA_AH_ATTR_TYPE_ROCE) {
-		dev_err(hr_dev->dev, "ah attr is not RDMA roce type\n");
+		ibdev_err(ibdev, "ah attr is not RDMA roce type\n");
 		return -EINVAL;
 	}
 
 	roce_set_field(context->byte_52_udpspn_dmac, V2_QPC_BYTE_52_UDPSPN_M,
 		       V2_QPC_BYTE_52_UDPSPN_S,
-		       is_udp ? 0x12b7 : 0);
+		       is_udp ? get_udp_sport(grh->flow_label, ibqp->qp_num,
+					      attr->dest_qp_num) : 0);
 
 	roce_set_field(qpc_mask->byte_52_udpspn_dmac, V2_QPC_BYTE_52_UDPSPN_M,
 		       V2_QPC_BYTE_52_UDPSPN_S, 0);
@@ -4124,29 +4468,58 @@
 	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_HOP_LIMIT_M,
 		       V2_QPC_BYTE_24_HOP_LIMIT_S, 0);
 
-	if (hr_dev->pci_dev->revision == 0x21 && is_udp)
-		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
-			       V2_QPC_BYTE_24_TC_S, grh->traffic_class >> 2);
-	else
-		roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
-			       V2_QPC_BYTE_24_TC_S, grh->traffic_class);
+	roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
+		       V2_QPC_BYTE_24_TC_S, get_tclass(&attr->ah_attr.grh));
 	roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M,
 		       V2_QPC_BYTE_24_TC_S, 0);
+
 	roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_FL_M,
 		       V2_QPC_BYTE_28_FL_S, grh->flow_label);
 	roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_FL_M,
 		       V2_QPC_BYTE_28_FL_S, 0);
 	memcpy(context->dgid, grh->dgid.raw, sizeof(grh->dgid.raw));
 	memset(qpc_mask->dgid, 0, sizeof(grh->dgid.raw));
+
+	hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr);
+	if (unlikely(hr_qp->sl > MAX_SERVICE_LEVEL)) {
+		ibdev_err(ibdev,
+			  "failed to fill QPC, sl (%d) shouldn't be larger than %d.\n",
+			  hr_qp->sl, MAX_SERVICE_LEVEL);
+		return -EINVAL;
+	}
+
 	roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_SL_M,
-		       V2_QPC_BYTE_28_SL_S, rdma_ah_get_sl(&attr->ah_attr));
+		       V2_QPC_BYTE_28_SL_S, hr_qp->sl);
 	roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_SL_M,
 		       V2_QPC_BYTE_28_SL_S, 0);
-	hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr);
 
 	return 0;
 }
 
+static bool check_qp_state(enum ib_qp_state cur_state,
+			   enum ib_qp_state new_state)
+{
+	static const bool sm[][IB_QPS_ERR + 1] = {
+		[IB_QPS_RESET] = { [IB_QPS_RESET] = true,
+				   [IB_QPS_INIT] = true },
+		[IB_QPS_INIT] = { [IB_QPS_RESET] = true,
+				  [IB_QPS_INIT] = true,
+				  [IB_QPS_RTR] = true,
+				  [IB_QPS_ERR] = true },
+		[IB_QPS_RTR] = { [IB_QPS_RESET] = true,
+				 [IB_QPS_RTS] = true,
+				 [IB_QPS_ERR] = true },
+		[IB_QPS_RTS] = { [IB_QPS_RESET] = true,
+				 [IB_QPS_RTS] = true,
+				 [IB_QPS_ERR] = true },
+		[IB_QPS_SQD] = {},
+		[IB_QPS_SQE] = {},
+		[IB_QPS_ERR] = { [IB_QPS_RESET] = true, [IB_QPS_ERR] = true }
+	};
+
+	return sm[cur_state][new_state];
+}
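check_qp_state() whitelists transitions in a two-dimensional table indexed by current and next state; anything not listed is rejected before the per-transition handlers run. A self-contained mirror of that table (the enum order is assumed to match ib_qp_state) with a few sanity checks:

#include <assert.h>
#include <stdbool.h>

enum qps { QPS_RESET, QPS_INIT, QPS_RTR, QPS_RTS, QPS_SQD, QPS_SQE, QPS_ERR };

static const bool sm[QPS_ERR + 1][QPS_ERR + 1] = {
	[QPS_RESET] = { [QPS_RESET] = true, [QPS_INIT] = true },
	[QPS_INIT]  = { [QPS_RESET] = true, [QPS_INIT] = true,
			[QPS_RTR]   = true, [QPS_ERR]  = true },
	[QPS_RTR]   = { [QPS_RESET] = true, [QPS_RTS] = true, [QPS_ERR] = true },
	[QPS_RTS]   = { [QPS_RESET] = true, [QPS_RTS] = true, [QPS_ERR] = true },
	[QPS_ERR]   = { [QPS_RESET] = true, [QPS_ERR] = true },
	/* SQD and SQE rows stay all false, as in the patch. */
};

int main(void)
{
	assert(sm[QPS_RESET][QPS_INIT]);	/* RESET -> INIT allowed  */
	assert(sm[QPS_INIT][QPS_RTR]);		/* INIT  -> RTR  allowed  */
	assert(!sm[QPS_RESET][QPS_RTS]);	/* RESET -> RTS  rejected */
	assert(!sm[QPS_SQD][QPS_RTS]);		/* SQD   -> RTS  rejected */
	return 0;
}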
+
 static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp,
 				      const struct ib_qp_attr *attr,
 				      int attr_mask,
@@ -4158,8 +4531,13 @@
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	int ret = 0;
 
+	if (!check_qp_state(cur_state, new_state)) {
+		ibdev_err(&hr_dev->ib_dev, "Illegal state for QP!\n");
+		return -EINVAL;
+	}
+
 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
-		memset(qpc_mask, 0, sizeof(*qpc_mask));
+		memset(qpc_mask, 0, hr_dev->caps.qpc_sz);
 		modify_qp_reset_to_init(ibqp, attr, attr_mask, context,
 					qpc_mask);
 	} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) {
@@ -4168,23 +4546,11 @@
 	} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
 		ret = modify_qp_init_to_rtr(ibqp, attr, attr_mask, context,
 					    qpc_mask);
-		if (ret)
-			goto out;
 	} else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) {
 		ret = modify_qp_rtr_to_rts(ibqp, attr, attr_mask, context,
 					   qpc_mask);
-		if (ret)
-			goto out;
-	} else if (hns_roce_v2_check_qp_stat(cur_state, new_state)) {
-		/* Nothing */
-		;
-	} else {
-		dev_err(hr_dev->dev, "Illegal state for QP!\n");
-		ret = -EINVAL;
-		goto out;
 	}
 
-out:
 	return ret;
 }
 
@@ -4214,8 +4580,8 @@
 				       V2_QPC_BYTE_28_AT_M, V2_QPC_BYTE_28_AT_S,
 				       0);
 		} else {
-			dev_warn(hr_dev->dev,
-				 "Local ACK timeout shall be 0 to 30.\n");
+			ibdev_warn(&hr_dev->ib_dev,
+				   "Local ACK timeout shall be 0 to 30.\n");
 		}
 	}
 
@@ -4230,8 +4596,7 @@
 
 		roce_set_field(context->byte_212_lsn,
 			       V2_QPC_BYTE_212_RETRY_CNT_M,
-			       V2_QPC_BYTE_212_RETRY_CNT_S,
-			       attr->retry_cnt);
+			       V2_QPC_BYTE_212_RETRY_CNT_S, attr->retry_cnt);
 		roce_set_field(qpc_mask->byte_212_lsn,
 			       V2_QPC_BYTE_212_RETRY_CNT_M,
 			       V2_QPC_BYTE_212_RETRY_CNT_S, 0);
@@ -4384,7 +4749,9 @@
 	struct hns_roce_v2_qp_context ctx[2];
 	struct hns_roce_v2_qp_context *context = ctx;
 	struct hns_roce_v2_qp_context *qpc_mask = ctx + 1;
-	struct device *dev = hr_dev->dev;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	unsigned long sq_flag = 0;
+	unsigned long rq_flag = 0;
 	int ret;
 
 	/*
@@ -4393,8 +4760,9 @@
 	 * we should set all bits of the relevant fields in context mask to
 	 * 0 at the same time, else set them to 0x1.
 	 */
-	memset(context, 0, sizeof(*context));
-	memset(qpc_mask, 0xff, sizeof(*qpc_mask));
+	memset(context, 0, hr_dev->caps.qpc_sz);
+	memset(qpc_mask, 0xff, hr_dev->caps.qpc_sz);
+
 	ret = hns_roce_v2_set_abs_fields(ibqp, attr, attr_mask, cur_state,
 					 new_state, context, qpc_mask);
 	if (ret)
@@ -4402,6 +4770,8 @@
 
 	/* When QP state is err, SQ and RQ WQE should be flushed */
 	if (new_state == IB_QPS_ERR) {
+		spin_lock_irqsave(&hr_qp->sq.lock, sq_flag);
+		hr_qp->state = IB_QPS_ERR;
 		roce_set_field(context->byte_160_sq_ci_pi,
 			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
 			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S,
@@ -4409,8 +4779,10 @@
 		roce_set_field(qpc_mask->byte_160_sq_ci_pi,
 			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_M,
 			       V2_QPC_BYTE_160_SQ_PRODUCER_IDX_S, 0);
+		spin_unlock_irqrestore(&hr_qp->sq.lock, sq_flag);
 
 		if (!ibqp->srq) {
+			spin_lock_irqsave(&hr_qp->rq.lock, rq_flag);
 			roce_set_field(context->byte_84_rq_ci_pi,
 			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
 			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S,
@@ -4418,6 +4790,7 @@
 			roce_set_field(qpc_mask->byte_84_rq_ci_pi,
 			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_M,
 			       V2_QPC_BYTE_84_RQ_PRODUCER_IDX_S, 0);
+			spin_unlock_irqrestore(&hr_qp->rq.lock, rq_flag);
 		}
 	}
 
@@ -4439,9 +4812,9 @@
 		       V2_QPC_BYTE_60_QP_ST_S, 0);
 
 	/* SW pass context to HW */
-	ret = hns_roce_v2_qp_modify(hr_dev, cur_state, new_state, ctx, hr_qp);
+	ret = hns_roce_v2_qp_modify(hr_dev, context, qpc_mask, hr_qp);
 	if (ret) {
-		dev_err(dev, "hns_roce_qp_modify failed(%d)\n", ret);
+		ibdev_err(ibdev, "failed to modify QP, ret = %d.\n", ret);
 		goto out;
 	}
 
@@ -4469,19 +4842,20 @@
 	return ret;
 }
 
-static inline enum ib_qp_state to_ib_qp_st(enum hns_roce_v2_qp_state state)
+static int to_ib_qp_st(enum hns_roce_v2_qp_state state)
 {
-	switch (state) {
-	case HNS_ROCE_QP_ST_RST:	return IB_QPS_RESET;
-	case HNS_ROCE_QP_ST_INIT:	return IB_QPS_INIT;
-	case HNS_ROCE_QP_ST_RTR:	return IB_QPS_RTR;
-	case HNS_ROCE_QP_ST_RTS:	return IB_QPS_RTS;
-	case HNS_ROCE_QP_ST_SQ_DRAINING:
-	case HNS_ROCE_QP_ST_SQD:	return IB_QPS_SQD;
-	case HNS_ROCE_QP_ST_SQER:	return IB_QPS_SQE;
-	case HNS_ROCE_QP_ST_ERR:	return IB_QPS_ERR;
-	default:			return -1;
-	}
+	static const enum ib_qp_state map[] = {
+		[HNS_ROCE_QP_ST_RST] = IB_QPS_RESET,
+		[HNS_ROCE_QP_ST_INIT] = IB_QPS_INIT,
+		[HNS_ROCE_QP_ST_RTR] = IB_QPS_RTR,
+		[HNS_ROCE_QP_ST_RTS] = IB_QPS_RTS,
+		[HNS_ROCE_QP_ST_SQD] = IB_QPS_SQD,
+		[HNS_ROCE_QP_ST_SQER] = IB_QPS_SQE,
+		[HNS_ROCE_QP_ST_ERR] = IB_QPS_ERR,
+		[HNS_ROCE_QP_ST_SQ_DRAINING] = IB_QPS_SQD
+	};
+
+	return (state < ARRAY_SIZE(map)) ? map[state] : -1;
 }
 
 static int hns_roce_v2_query_qpc(struct hns_roce_dev *hr_dev,
@@ -4498,12 +4872,10 @@
 	ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, hr_qp->qpn, 0,
 				HNS_ROCE_CMD_QUERY_QPC,
 				HNS_ROCE_CMD_TIMEOUT_MSECS);
-	if (ret) {
-		dev_err(hr_dev->dev, "QUERY QP cmd process error\n");
+	if (ret)
 		goto out;
-	}
 
-	memcpy(hr_context, mailbox->buf, sizeof(*hr_context));
+	memcpy(hr_context, mailbox->buf, hr_dev->caps.qpc_sz);
 
 out:
 	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
@@ -4517,7 +4889,7 @@
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
 	struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
 	struct hns_roce_v2_qp_context context = {};
-	struct device *dev = hr_dev->dev;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	int tmp_qp_state;
 	int state;
 	int ret;
@@ -4535,7 +4907,7 @@
 
 	ret = hns_roce_v2_query_qpc(hr_dev, hr_qp, &context);
 	if (ret) {
-		dev_err(dev, "query qpc error\n");
+		ibdev_err(ibdev, "failed to query QPC, ret = %d.\n", ret);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -4544,7 +4916,7 @@
 			       V2_QPC_BYTE_60_QP_ST_M, V2_QPC_BYTE_60_QP_ST_S);
 	tmp_qp_state = to_ib_qp_st((enum hns_roce_v2_qp_state)state);
 	if (tmp_qp_state == -1) {
-		dev_err(dev, "Illegal ib_qp_state\n");
+		ibdev_err(ibdev, "Illegal ib_qp_state\n");
 		ret = -EINVAL;
 		goto out;
 	}
@@ -4645,8 +5017,9 @@
 					 struct hns_roce_qp *hr_qp,
 					 struct ib_udata *udata)
 {
-	struct hns_roce_cq *send_cq, *recv_cq;
 	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_cq *send_cq, *recv_cq;
+	unsigned long flags;
 	int ret = 0;
 
 	if (hr_qp->ibqp.qp_type == IB_QPT_RC && hr_qp->state != IB_QPS_RESET) {
@@ -4654,61 +5027,33 @@
 		ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0,
 					    hr_qp->state, IB_QPS_RESET);
 		if (ret)
-			ibdev_err(ibdev, "modify QP to Reset failed.\n");
+			ibdev_err(ibdev,
+				  "failed to modify QP to RST, ret = %d.\n",
+				  ret);
 	}
 
-	send_cq = to_hr_cq(hr_qp->ibqp.send_cq);
-	recv_cq = to_hr_cq(hr_qp->ibqp.recv_cq);
+	send_cq = hr_qp->ibqp.send_cq ? to_hr_cq(hr_qp->ibqp.send_cq) : NULL;
+	recv_cq = hr_qp->ibqp.recv_cq ? to_hr_cq(hr_qp->ibqp.recv_cq) : NULL;
 
+	spin_lock_irqsave(&hr_dev->qp_list_lock, flags);
 	hns_roce_lock_cqs(send_cq, recv_cq);
 
 	if (!udata) {
-		__hns_roce_v2_cq_clean(recv_cq, hr_qp->qpn, hr_qp->ibqp.srq ?
-				       to_hr_srq(hr_qp->ibqp.srq) : NULL);
-		if (send_cq != recv_cq)
+		if (recv_cq)
+			__hns_roce_v2_cq_clean(recv_cq, hr_qp->qpn,
+					       (hr_qp->ibqp.srq ?
+						to_hr_srq(hr_qp->ibqp.srq) :
+						NULL));
+
+		if (send_cq && send_cq != recv_cq)
 			__hns_roce_v2_cq_clean(send_cq, hr_qp->qpn, NULL);
+
 	}
 
 	hns_roce_qp_remove(hr_dev, hr_qp);
 
 	hns_roce_unlock_cqs(send_cq, recv_cq);
-
-	hns_roce_qp_free(hr_dev, hr_qp);
-
-	/* Not special_QP, free their QPN */
-	if ((hr_qp->ibqp.qp_type == IB_QPT_RC) ||
-	    (hr_qp->ibqp.qp_type == IB_QPT_UC) ||
-	    (hr_qp->ibqp.qp_type == IB_QPT_UD))
-		hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
-
-	hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);
-
-	if (udata) {
-		struct hns_roce_ucontext *context =
-			rdma_udata_to_drv_context(
-				udata,
-				struct hns_roce_ucontext,
-				ibucontext);
-
-		if (hr_qp->sq.wqe_cnt && (hr_qp->sdb_en == 1))
-			hns_roce_db_unmap_user(context, &hr_qp->sdb);
-
-		if (hr_qp->rq.wqe_cnt && (hr_qp->rdb_en == 1))
-			hns_roce_db_unmap_user(context, &hr_qp->rdb);
-	} else {
-		kfree(hr_qp->sq.wrid);
-		kfree(hr_qp->rq.wrid);
-		hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
-		if (hr_qp->rq.wqe_cnt)
-			hns_roce_free_db(hr_dev, &hr_qp->rdb);
-	}
-	ib_umem_release(hr_qp->umem);
-
-	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
-	     hr_qp->rq.wqe_cnt) {
-		kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list);
-		kfree(hr_qp->rq_inl_buf.wqe_list);
-	}
+	spin_unlock_irqrestore(&hr_dev->qp_list_lock, flags);
 
 	return ret;
 }
@@ -4721,20 +5066,19 @@
 
 	ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata);
 	if (ret)
-		ibdev_err(&hr_dev->ib_dev, "Destroy qp 0x%06lx failed(%d)\n",
+		ibdev_err(&hr_dev->ib_dev,
+			  "failed to destroy QP, QPN = 0x%06lx, ret = %d.\n",
 			  hr_qp->qpn, ret);
 
-	if (hr_qp->ibqp.qp_type == IB_QPT_GSI)
-		kfree(hr_to_hr_sqp(hr_qp));
-	else
-		kfree(hr_qp);
+	hns_roce_qp_destroy(hr_dev, hr_qp, udata);
 
 	return 0;
 }
 
 static int hns_roce_v2_qp_flow_control_init(struct hns_roce_dev *hr_dev,
-						struct hns_roce_qp *hr_qp)
+					    struct hns_roce_qp *hr_qp)
 {
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_sccc_clr_done *resp;
 	struct hns_roce_sccc_clr *clr;
 	struct hns_roce_cmq_desc desc;
@@ -4746,7 +5090,7 @@
 	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_RESET_SCCC, false);
 	ret =  hns_roce_cmq_send(hr_dev, &desc, 1);
 	if (ret) {
-		dev_err(hr_dev->dev, "Reset SCC ctx  failed(%d)\n", ret);
+		ibdev_err(ibdev, "failed to reset SCC ctx, ret = %d.\n", ret);
 		goto out;
 	}
 
@@ -4756,7 +5100,7 @@
 	clr->qpn = cpu_to_le32(hr_qp->qpn);
 	ret =  hns_roce_cmq_send(hr_dev, &desc, 1);
 	if (ret) {
-		dev_err(hr_dev->dev, "Clear SCC ctx failed(%d)\n", ret);
+		ibdev_err(ibdev, "failed to clear SCC ctx, ret = %d.\n", ret);
 		goto out;
 	}
 
@@ -4767,7 +5111,8 @@
 					      HNS_ROCE_OPC_QUERY_SCCC, true);
 		ret = hns_roce_cmq_send(hr_dev, &desc, 1);
 		if (ret) {
-			dev_err(hr_dev->dev, "Query clr cmq failed(%d)\n", ret);
+			ibdev_err(ibdev, "failed to query clr cmq, ret = %d\n",
+				  ret);
 			goto out;
 		}
 
@@ -4777,7 +5122,7 @@
 		msleep(20);
 	}
 
-	dev_err(hr_dev->dev, "Query SCC clr done flag overtime.\n");
+	ibdev_err(ibdev, "Query SCC clr done flag overtime.\n");
 	ret = -ETIMEDOUT;
 
 out:
@@ -4785,6 +5130,188 @@
 	return ret;
 }
 
+static void hns_roce_v2_write_srqc(struct hns_roce_dev *hr_dev,
+				   struct hns_roce_srq *srq, u32 pdn, u16 xrcd,
+				   u32 cqn, void *mb_buf, u64 *mtts_wqe,
+				   u64 *mtts_idx, dma_addr_t dma_handle_wqe,
+				   dma_addr_t dma_handle_idx)
+{
+	struct hns_roce_srq_context *srq_context;
+
+	srq_context = mb_buf;
+	memset(srq_context, 0, sizeof(*srq_context));
+
+	roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQ_ST_M,
+		       SRQC_BYTE_4_SRQ_ST_S, 1);
+
+	roce_set_field(srq_context->byte_4_srqn_srqst,
+		       SRQC_BYTE_4_SRQ_WQE_HOP_NUM_M,
+		       SRQC_BYTE_4_SRQ_WQE_HOP_NUM_S,
+		       to_hr_hem_hopnum(hr_dev->caps.srqwqe_hop_num,
+					srq->wqe_cnt));
+	roce_set_field(srq_context->byte_4_srqn_srqst,
+		       SRQC_BYTE_4_SRQ_SHIFT_M, SRQC_BYTE_4_SRQ_SHIFT_S,
+		       ilog2(srq->wqe_cnt));
+
+	roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQN_M,
+		       SRQC_BYTE_4_SRQN_S, srq->srqn);
+
+	roce_set_field(srq_context->byte_8_limit_wl, SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+		       SRQC_BYTE_8_SRQ_LIMIT_WL_S, 0);
+
+	roce_set_field(srq_context->byte_12_xrcd, SRQC_BYTE_12_SRQ_XRCD_M,
+		       SRQC_BYTE_12_SRQ_XRCD_S, xrcd);
+
+	srq_context->wqe_bt_ba = cpu_to_le32((u32)(dma_handle_wqe >> 3));
+
+	roce_set_field(srq_context->byte_24_wqe_bt_ba,
+		       SRQC_BYTE_24_SRQ_WQE_BT_BA_M,
+		       SRQC_BYTE_24_SRQ_WQE_BT_BA_S,
+		       dma_handle_wqe >> 35);
+
+	roce_set_field(srq_context->byte_28_rqws_pd, SRQC_BYTE_28_PD_M,
+		       SRQC_BYTE_28_PD_S, pdn);
+	roce_set_field(srq_context->byte_28_rqws_pd, SRQC_BYTE_28_RQWS_M,
+		       SRQC_BYTE_28_RQWS_S, srq->max_gs <= 0 ? 0 :
+		       fls(srq->max_gs - 1));
+
+	srq_context->idx_bt_ba = cpu_to_le32(dma_handle_idx >> 3);
+	roce_set_field(srq_context->rsv_idx_bt_ba,
+		       SRQC_BYTE_36_SRQ_IDX_BT_BA_M,
+		       SRQC_BYTE_36_SRQ_IDX_BT_BA_S,
+		       dma_handle_idx >> 35);
+
+	srq_context->idx_cur_blk_addr =
+		cpu_to_le32(to_hr_hw_page_addr(mtts_idx[0]));
+	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+		       SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_M,
+		       SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_S,
+		       upper_32_bits(to_hr_hw_page_addr(mtts_idx[0])));
+	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+		       SRQC_BYTE_44_SRQ_IDX_HOP_NUM_M,
+		       SRQC_BYTE_44_SRQ_IDX_HOP_NUM_S,
+		       to_hr_hem_hopnum(hr_dev->caps.idx_hop_num,
+					srq->wqe_cnt));
+
+	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+		       SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_M,
+		       SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_S,
+		to_hr_hw_page_shift(srq->idx_que.mtr.hem_cfg.ba_pg_shift));
+	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
+		       SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_M,
+		       SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_S,
+		to_hr_hw_page_shift(srq->idx_que.mtr.hem_cfg.buf_pg_shift));
+
+	srq_context->idx_nxt_blk_addr =
+				cpu_to_le32(to_hr_hw_page_addr(mtts_idx[1]));
+	roce_set_field(srq_context->rsv_idxnxtblkaddr,
+		       SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_M,
+		       SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_S,
+		       upper_32_bits(to_hr_hw_page_addr(mtts_idx[1])));
+	roce_set_field(srq_context->byte_56_xrc_cqn,
+		       SRQC_BYTE_56_SRQ_XRC_CQN_M, SRQC_BYTE_56_SRQ_XRC_CQN_S,
+		       cqn);
+	roce_set_field(srq_context->byte_56_xrc_cqn,
+		       SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_M,
+		       SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_S,
+		       to_hr_hw_page_shift(srq->buf_mtr.hem_cfg.ba_pg_shift));
+	roce_set_field(srq_context->byte_56_xrc_cqn,
+		       SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_M,
+		       SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_S,
+		       to_hr_hw_page_shift(srq->buf_mtr.hem_cfg.buf_pg_shift));
+
+	roce_set_bit(srq_context->db_record_addr_record_en,
+		     SRQC_BYTE_60_SRQ_RECORD_EN_S, 0);
+}
+
+static int hns_roce_v2_modify_srq(struct ib_srq *ibsrq,
+				  struct ib_srq_attr *srq_attr,
+				  enum ib_srq_attr_mask srq_attr_mask,
+				  struct ib_udata *udata)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
+	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+	struct hns_roce_srq_context *srq_context;
+	struct hns_roce_srq_context *srqc_mask;
+	struct hns_roce_cmd_mailbox *mailbox;
+	int ret;
+
+	/* Resizing SRQs is not supported yet */
+	if (srq_attr_mask & IB_SRQ_MAX_WR)
+		return -EINVAL;
+
+	if (srq_attr_mask & IB_SRQ_LIMIT) {
+		if (srq_attr->srq_limit >= srq->wqe_cnt)
+			return -EINVAL;
+
+		mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+		if (IS_ERR(mailbox))
+			return PTR_ERR(mailbox);
+
+		srq_context = mailbox->buf;
+		srqc_mask = (struct hns_roce_srq_context *)mailbox->buf + 1;
+
+		memset(srqc_mask, 0xff, sizeof(*srqc_mask));
+
+		roce_set_field(srq_context->byte_8_limit_wl,
+			       SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+			       SRQC_BYTE_8_SRQ_LIMIT_WL_S, srq_attr->srq_limit);
+		roce_set_field(srqc_mask->byte_8_limit_wl,
+			       SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+			       SRQC_BYTE_8_SRQ_LIMIT_WL_S, 0);
+
+		ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, srq->srqn, 0,
+					HNS_ROCE_CMD_MODIFY_SRQC,
+					HNS_ROCE_CMD_TIMEOUT_MSECS);
+		hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+		if (ret) {
+			ibdev_err(&hr_dev->ib_dev,
+				  "failed to handle cmd of modifying SRQ, ret = %d.\n",
+				  ret);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
+	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
+	struct hns_roce_srq_context *srq_context;
+	struct hns_roce_cmd_mailbox *mailbox;
+	int limit_wl;
+	int ret;
+
+	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+	if (IS_ERR(mailbox))
+		return PTR_ERR(mailbox);
+
+	srq_context = mailbox->buf;
+	ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, srq->srqn, 0,
+				HNS_ROCE_CMD_QUERY_SRQC,
+				HNS_ROCE_CMD_TIMEOUT_MSECS);
+	if (ret) {
+		ibdev_err(&hr_dev->ib_dev,
+			  "failed to process cmd of querying SRQ, ret = %d.\n",
+			  ret);
+		goto out;
+	}
+
+	limit_wl = roce_get_field(srq_context->byte_8_limit_wl,
+				  SRQC_BYTE_8_SRQ_LIMIT_WL_M,
+				  SRQC_BYTE_8_SRQ_LIMIT_WL_S);
+
+	attr->srq_limit = limit_wl;
+	attr->max_wr = srq->wqe_cnt - 1;
+	attr->max_sge = srq->max_gs;
+
+out:
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+	return ret;
+}
+
 static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(cq->device);
@@ -4821,99 +5348,65 @@
 				HNS_ROCE_CMD_TIMEOUT_MSECS);
 	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
 	if (ret)
-		dev_err(hr_dev->dev, "MODIFY CQ Failed to cmd mailbox.\n");
+		ibdev_err(&hr_dev->ib_dev,
+			  "failed to process cmd when modifying CQ, ret = %d.\n",
+			  ret);
 
 	return ret;
 }
 
-static void hns_roce_set_qps_to_err(struct hns_roce_dev *hr_dev, u32 qpn)
-{
-	struct hns_roce_qp *hr_qp;
-	struct ib_qp_attr attr;
-	int attr_mask;
-	int ret;
-
-	hr_qp = __hns_roce_qp_lookup(hr_dev, qpn);
-	if (!hr_qp) {
-		dev_warn(hr_dev->dev, "no hr_qp can be found!\n");
-		return;
-	}
-
-	if (hr_qp->ibqp.uobject) {
-		if (hr_qp->sdb_en == 1) {
-			hr_qp->sq.head = *(int *)(hr_qp->sdb.virt_addr);
-			if (hr_qp->rdb_en == 1)
-				hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr);
-		} else {
-			dev_warn(hr_dev->dev, "flush cqe is unsupported in userspace!\n");
-			return;
-		}
-	}
-
-	attr_mask = IB_QP_STATE;
-	attr.qp_state = IB_QPS_ERR;
-	ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, &attr, attr_mask,
-				    hr_qp->state, IB_QPS_ERR);
-	if (ret)
-		dev_err(hr_dev->dev, "failed to modify qp %d to err state.\n",
-			qpn);
-}
-
 static void hns_roce_irq_work_handle(struct work_struct *work)
 {
 	struct hns_roce_work *irq_work =
 				container_of(work, struct hns_roce_work, work);
-	struct device *dev = irq_work->hr_dev->dev;
+	struct ib_device *ibdev = &irq_work->hr_dev->ib_dev;
 	u32 qpn = irq_work->qpn;
 	u32 cqn = irq_work->cqn;
 
 	switch (irq_work->event_type) {
 	case HNS_ROCE_EVENT_TYPE_PATH_MIG:
-		dev_info(dev, "Path migrated succeeded.\n");
+		ibdev_info(ibdev, "Path migrated succeeded.\n");
 		break;
 	case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED:
-		dev_warn(dev, "Path migration failed.\n");
+		ibdev_warn(ibdev, "Path migration failed.\n");
 		break;
 	case HNS_ROCE_EVENT_TYPE_COMM_EST:
 		break;
 	case HNS_ROCE_EVENT_TYPE_SQ_DRAINED:
-		dev_warn(dev, "Send queue drained.\n");
+		ibdev_warn(ibdev, "Send queue drained.\n");
 		break;
 	case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR:
-		dev_err(dev, "Local work queue 0x%x catas error, sub_type:%d\n",
-			qpn, irq_work->sub_type);
-		hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
+		ibdev_err(ibdev, "Local work queue 0x%x catast error, sub_event type is: %d\n",
+			  qpn, irq_work->sub_type);
 		break;
 	case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR:
-		dev_err(dev, "Invalid request local work queue 0x%x error.\n",
-			qpn);
-		hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
+		ibdev_err(ibdev, "Invalid request local work queue 0x%x error.\n",
+			  qpn);
 		break;
 	case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR:
-		dev_err(dev, "Local access violation work queue 0x%x error, sub_type:%d\n",
-			qpn, irq_work->sub_type);
-		hns_roce_set_qps_to_err(irq_work->hr_dev, qpn);
+		ibdev_err(ibdev, "Local access violation work queue 0x%x error, sub_event type is: %d\n",
+			  qpn, irq_work->sub_type);
 		break;
 	case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH:
-		dev_warn(dev, "SRQ limit reach.\n");
+		ibdev_warn(ibdev, "SRQ limit reach.\n");
 		break;
 	case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH:
-		dev_warn(dev, "SRQ last wqe reach.\n");
+		ibdev_warn(ibdev, "SRQ last wqe reach.\n");
 		break;
 	case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR:
-		dev_err(dev, "SRQ catas error.\n");
+		ibdev_err(ibdev, "SRQ catas error.\n");
 		break;
 	case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR:
-		dev_err(dev, "CQ 0x%x access err.\n", cqn);
+		ibdev_err(ibdev, "CQ 0x%x access err.\n", cqn);
 		break;
 	case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW:
-		dev_warn(dev, "CQ 0x%x overflow\n", cqn);
+		ibdev_warn(ibdev, "CQ 0x%x overflow\n", cqn);
 		break;
 	case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW:
-		dev_warn(dev, "DB overflow.\n");
+		ibdev_warn(ibdev, "DB overflow.\n");
 		break;
 	case HNS_ROCE_EVENT_TYPE_FLR:
-		dev_warn(dev, "Function level reset.\n");
+		ibdev_warn(ibdev, "Function level reset.\n");
 		break;
 	default:
 		break;
@@ -4944,10 +5437,7 @@
 static void set_eq_cons_index_v2(struct hns_roce_eq *eq)
 {
 	struct hns_roce_dev *hr_dev = eq->hr_dev;
-	__le32 doorbell[2];
-
-	doorbell[0] = 0;
-	doorbell[1] = 0;
+	__le32 doorbell[2] = {};
 
 	if (eq->type_flag == HNS_ROCE_AEQ) {
 		roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_CMD_M,
@@ -4973,43 +5463,13 @@
 	hns_roce_write64(hr_dev, doorbell, eq->doorbell);
 }
 
-static struct hns_roce_aeqe *get_aeqe_v2(struct hns_roce_eq *eq, u32 entry)
-{
-	u32 buf_chk_sz;
-	unsigned long off;
-
-	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
-	off = (entry & (eq->entries - 1)) * HNS_ROCE_AEQ_ENTRY_SIZE;
-
-	return (struct hns_roce_aeqe *)((char *)(eq->buf_list->buf) +
-		off % buf_chk_sz);
-}
-
-static struct hns_roce_aeqe *mhop_get_aeqe(struct hns_roce_eq *eq, u32 entry)
-{
-	u32 buf_chk_sz;
-	unsigned long off;
-
-	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
-
-	off = (entry & (eq->entries - 1)) * HNS_ROCE_AEQ_ENTRY_SIZE;
-
-	if (eq->hop_num == HNS_ROCE_HOP_NUM_0)
-		return (struct hns_roce_aeqe *)((u8 *)(eq->bt_l0) +
-			off % buf_chk_sz);
-	else
-		return (struct hns_roce_aeqe *)((u8 *)
-			(eq->buf[off / buf_chk_sz]) + off % buf_chk_sz);
-}
-
 static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq)
 {
 	struct hns_roce_aeqe *aeqe;
 
-	if (!eq->hop_num)
-		aeqe = get_aeqe_v2(eq, eq->cons_index);
-	else
-		aeqe = mhop_get_aeqe(eq, eq->cons_index);
+	aeqe = hns_roce_buf_offset(eq->mtr.kmem,
+				   (eq->cons_index & (eq->entries - 1)) *
+				   eq->eqe_size);
 
 	return (roce_get_bit(aeqe->asyn, HNS_ROCE_V2_AEQ_AEQE_OWNER_S) ^
 		!!(eq->cons_index & eq->entries)) ? aeqe : NULL;
@@ -5103,43 +5563,13 @@
 	return aeqe_found;
 }
 
-static struct hns_roce_ceqe *get_ceqe_v2(struct hns_roce_eq *eq, u32 entry)
-{
-	u32 buf_chk_sz;
-	unsigned long off;
-
-	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
-	off = (entry & (eq->entries - 1)) * HNS_ROCE_CEQ_ENTRY_SIZE;
-
-	return (struct hns_roce_ceqe *)((char *)(eq->buf_list->buf) +
-		off % buf_chk_sz);
-}
-
-static struct hns_roce_ceqe *mhop_get_ceqe(struct hns_roce_eq *eq, u32 entry)
-{
-	u32 buf_chk_sz;
-	unsigned long off;
-
-	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
-
-	off = (entry & (eq->entries - 1)) * HNS_ROCE_CEQ_ENTRY_SIZE;
-
-	if (eq->hop_num == HNS_ROCE_HOP_NUM_0)
-		return (struct hns_roce_ceqe *)((u8 *)(eq->bt_l0) +
-			off % buf_chk_sz);
-	else
-		return (struct hns_roce_ceqe *)((u8 *)(eq->buf[off /
-			buf_chk_sz]) + off % buf_chk_sz);
-}
-
 static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq)
 {
 	struct hns_roce_ceqe *ceqe;
 
-	if (!eq->hop_num)
-		ceqe = get_ceqe_v2(eq, eq->cons_index);
-	else
-		ceqe = mhop_get_ceqe(eq, eq->cons_index);
+	ceqe = hns_roce_buf_offset(eq->mtr.kmem,
+				   (eq->cons_index & (eq->entries - 1)) *
+				   eq->eqe_size);
 
 	return (!!(roce_get_bit(ceqe->comp, HNS_ROCE_V2_CEQ_CEQE_OWNER_S))) ^
 		(!!(eq->cons_index & eq->entries)) ? ceqe : NULL;
@@ -5148,7 +5578,6 @@
 static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev,
 			       struct hns_roce_eq *eq)
 {
-	struct device *dev = hr_dev->dev;
 	struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq);
 	int ceqe_found = 0;
 	u32 cqn;
@@ -5159,8 +5588,7 @@
 		 */
 		dma_rmb();
 
-		cqn = roce_get_field(ceqe->comp,
-				     HNS_ROCE_V2_CEQE_COMP_CQN_M,
+		cqn = roce_get_field(ceqe->comp, HNS_ROCE_V2_CEQE_COMP_CQN_M,
 				     HNS_ROCE_V2_CEQE_COMP_CQN_S);
 
 		hns_roce_cq_completion(hr_dev, cqn);
@@ -5168,10 +5596,8 @@
 		++eq->cons_index;
 		ceqe_found = 1;
 
-		if (eq->cons_index > (EQ_DEPTH_COEFF * eq->entries - 1)) {
-			dev_warn(dev, "cons_index overflow, set back to 0.\n");
+		if (eq->cons_index > (EQ_DEPTH_COEFF * eq->entries - 1))
 			eq->cons_index = 0;
-		}
 
 		ceqe = next_ceqe_sw_v2(eq);
 	}
@@ -5185,7 +5611,7 @@
 {
 	struct hns_roce_eq *eq = eq_ptr;
 	struct hns_roce_dev *hr_dev = eq->hr_dev;
-	int int_work = 0;
+	int int_work;
 
 	if (eq->type_flag == HNS_ROCE_CEQ)
 		/* Completion event interrupt */
@@ -5301,494 +5727,187 @@
 		dev_err(dev, "[mailbox cmd] destroy eqc(%d) failed.\n", eqn);
 }
 
-static void hns_roce_mhop_free_eq(struct hns_roce_dev *hr_dev,
-				  struct hns_roce_eq *eq)
+static void free_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq)
 {
-	struct device *dev = hr_dev->dev;
-	u64 idx;
-	u64 size;
-	u32 buf_chk_sz;
-	u32 bt_chk_sz;
-	u32 mhop_num;
-	int eqe_alloc;
-	int i = 0;
-	int j = 0;
-
-	mhop_num = hr_dev->caps.eqe_hop_num;
-	buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT);
-	bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT);
-
-	if (mhop_num == HNS_ROCE_HOP_NUM_0) {
-		dma_free_coherent(dev, (unsigned int)(eq->entries *
-				  eq->eqe_size), eq->bt_l0, eq->l0_dma);
-		return;
-	}
-
-	dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma);
-	if (mhop_num == 1) {
-		for (i = 0; i < eq->l0_last_num; i++) {
-			if (i == eq->l0_last_num - 1) {
-				eqe_alloc = i * (buf_chk_sz / eq->eqe_size);
-				size = (eq->entries - eqe_alloc) * eq->eqe_size;
-				dma_free_coherent(dev, size, eq->buf[i],
-						  eq->buf_dma[i]);
-				break;
-			}
-			dma_free_coherent(dev, buf_chk_sz, eq->buf[i],
-					  eq->buf_dma[i]);
-		}
-	} else if (mhop_num == 2) {
-		for (i = 0; i < eq->l0_last_num; i++) {
-			dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
-					  eq->l1_dma[i]);
-
-			for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
-				idx = i * (bt_chk_sz / BA_BYTE_LEN) + j;
-				if ((i == eq->l0_last_num - 1)
-				     && j == eq->l1_last_num - 1) {
-					eqe_alloc = (buf_chk_sz / eq->eqe_size)
-						    * idx;
-					size = (eq->entries - eqe_alloc)
-						* eq->eqe_size;
-					dma_free_coherent(dev, size,
-							  eq->buf[idx],
-							  eq->buf_dma[idx]);
-					break;
-				}
-				dma_free_coherent(dev, buf_chk_sz, eq->buf[idx],
-						  eq->buf_dma[idx]);
-			}
-		}
-	}
-	kfree(eq->buf_dma);
-	kfree(eq->buf);
-	kfree(eq->l1_dma);
-	kfree(eq->bt_l1);
-	eq->buf_dma = NULL;
-	eq->buf = NULL;
-	eq->l1_dma = NULL;
-	eq->bt_l1 = NULL;
+	hns_roce_mtr_destroy(hr_dev, &eq->mtr);
 }
 
-static void hns_roce_v2_free_eq(struct hns_roce_dev *hr_dev,
-				struct hns_roce_eq *eq)
+static int config_eqc(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq,
+		      void *mb_buf)
 {
-	u32 buf_chk_sz;
-
-	buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT);
-
-	if (hr_dev->caps.eqe_hop_num) {
-		hns_roce_mhop_free_eq(hr_dev, eq);
-		return;
-	}
-
-	dma_free_coherent(hr_dev->dev, buf_chk_sz, eq->buf_list->buf,
-			  eq->buf_list->map);
-	kfree(eq->buf_list);
-}
-
-static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev,
-				struct hns_roce_eq *eq,
-				void *mb_buf)
-{
+	u64 eqe_ba[MTT_MIN_COUNT] = { 0 };
 	struct hns_roce_eq_context *eqc;
+	u64 bt_ba = 0;
+	int count;
 
 	eqc = mb_buf;
 	memset(eqc, 0, sizeof(struct hns_roce_eq_context));
 
 	/* init eqc */
 	eq->doorbell = hr_dev->reg_base + ROCEE_VF_EQ_DB_CFG0_REG;
-	eq->hop_num = hr_dev->caps.eqe_hop_num;
 	eq->cons_index = 0;
 	eq->over_ignore = HNS_ROCE_V2_EQ_OVER_IGNORE_0;
 	eq->coalesce = HNS_ROCE_V2_EQ_COALESCE_0;
 	eq->arm_st = HNS_ROCE_V2_EQ_ALWAYS_ARMED;
-	eq->eqe_ba_pg_sz = hr_dev->caps.eqe_ba_pg_sz;
-	eq->eqe_buf_pg_sz = hr_dev->caps.eqe_buf_pg_sz;
 	eq->shift = ilog2((unsigned int)eq->entries);
 
-	if (!eq->hop_num)
-		eq->eqe_ba = eq->buf_list->map;
-	else
-		eq->eqe_ba = eq->l0_dma;
+	/* if not multi-hop, eqe buffer only use one trunk */
+	count = hns_roce_mtr_find(hr_dev, &eq->mtr, 0, eqe_ba, MTT_MIN_COUNT,
+				  &bt_ba);
+	if (count < 1) {
+		dev_err(hr_dev->dev, "failed to find EQE mtr\n");
+		return -ENOBUFS;
+	}
 
 	/* set eqc state */
-	roce_set_field(eqc->byte_4,
-		       HNS_ROCE_EQC_EQ_ST_M,
-		       HNS_ROCE_EQC_EQ_ST_S,
+	roce_set_field(eqc->byte_4, HNS_ROCE_EQC_EQ_ST_M, HNS_ROCE_EQC_EQ_ST_S,
 		       HNS_ROCE_V2_EQ_STATE_VALID);
 
 	/* set eqe hop num */
-	roce_set_field(eqc->byte_4,
-		       HNS_ROCE_EQC_HOP_NUM_M,
+	roce_set_field(eqc->byte_4, HNS_ROCE_EQC_HOP_NUM_M,
 		       HNS_ROCE_EQC_HOP_NUM_S, eq->hop_num);
 
 	/* set eqc over_ignore */
-	roce_set_field(eqc->byte_4,
-		       HNS_ROCE_EQC_OVER_IGNORE_M,
+	roce_set_field(eqc->byte_4, HNS_ROCE_EQC_OVER_IGNORE_M,
 		       HNS_ROCE_EQC_OVER_IGNORE_S, eq->over_ignore);
 
 	/* set eqc coalesce */
-	roce_set_field(eqc->byte_4,
-		       HNS_ROCE_EQC_COALESCE_M,
+	roce_set_field(eqc->byte_4, HNS_ROCE_EQC_COALESCE_M,
 		       HNS_ROCE_EQC_COALESCE_S, eq->coalesce);
 
 	/* set eqc arm_state */
-	roce_set_field(eqc->byte_4,
-		       HNS_ROCE_EQC_ARM_ST_M,
+	roce_set_field(eqc->byte_4, HNS_ROCE_EQC_ARM_ST_M,
 		       HNS_ROCE_EQC_ARM_ST_S, eq->arm_st);
 
 	/* set eqn */
-	roce_set_field(eqc->byte_4,
-		       HNS_ROCE_EQC_EQN_M,
-		       HNS_ROCE_EQC_EQN_S, eq->eqn);
+	roce_set_field(eqc->byte_4, HNS_ROCE_EQC_EQN_M, HNS_ROCE_EQC_EQN_S,
+		       eq->eqn);
 
 	/* set eqe_cnt */
-	roce_set_field(eqc->byte_4,
-		       HNS_ROCE_EQC_EQE_CNT_M,
-		       HNS_ROCE_EQC_EQE_CNT_S,
-		       HNS_ROCE_EQ_INIT_EQE_CNT);
+	roce_set_field(eqc->byte_4, HNS_ROCE_EQC_EQE_CNT_M,
+		       HNS_ROCE_EQC_EQE_CNT_S, HNS_ROCE_EQ_INIT_EQE_CNT);
 
 	/* set eqe_ba_pg_sz */
-	roce_set_field(eqc->byte_8,
-		       HNS_ROCE_EQC_BA_PG_SZ_M,
+	roce_set_field(eqc->byte_8, HNS_ROCE_EQC_BA_PG_SZ_M,
 		       HNS_ROCE_EQC_BA_PG_SZ_S,
-		       eq->eqe_ba_pg_sz + PG_SHIFT_OFFSET);
+		       to_hr_hw_page_shift(eq->mtr.hem_cfg.ba_pg_shift));
 
 	/* set eqe_buf_pg_sz */
-	roce_set_field(eqc->byte_8,
-		       HNS_ROCE_EQC_BUF_PG_SZ_M,
+	roce_set_field(eqc->byte_8, HNS_ROCE_EQC_BUF_PG_SZ_M,
 		       HNS_ROCE_EQC_BUF_PG_SZ_S,
-		       eq->eqe_buf_pg_sz + PG_SHIFT_OFFSET);
+		       to_hr_hw_page_shift(eq->mtr.hem_cfg.buf_pg_shift));
 
 	/* set eq_producer_idx */
-	roce_set_field(eqc->byte_8,
-		       HNS_ROCE_EQC_PROD_INDX_M,
-		       HNS_ROCE_EQC_PROD_INDX_S,
-		       HNS_ROCE_EQ_INIT_PROD_IDX);
+	roce_set_field(eqc->byte_8, HNS_ROCE_EQC_PROD_INDX_M,
+		       HNS_ROCE_EQC_PROD_INDX_S, HNS_ROCE_EQ_INIT_PROD_IDX);
 
 	/* set eq_max_cnt */
-	roce_set_field(eqc->byte_12,
-		       HNS_ROCE_EQC_MAX_CNT_M,
+	roce_set_field(eqc->byte_12, HNS_ROCE_EQC_MAX_CNT_M,
 		       HNS_ROCE_EQC_MAX_CNT_S, eq->eq_max_cnt);
 
 	/* set eq_period */
-	roce_set_field(eqc->byte_12,
-		       HNS_ROCE_EQC_PERIOD_M,
+	roce_set_field(eqc->byte_12, HNS_ROCE_EQC_PERIOD_M,
 		       HNS_ROCE_EQC_PERIOD_S, eq->eq_period);
 
 	/* set eqe_report_timer */
-	roce_set_field(eqc->eqe_report_timer,
-		       HNS_ROCE_EQC_REPORT_TIMER_M,
+	roce_set_field(eqc->eqe_report_timer, HNS_ROCE_EQC_REPORT_TIMER_M,
 		       HNS_ROCE_EQC_REPORT_TIMER_S,
 		       HNS_ROCE_EQ_INIT_REPORT_TIMER);
 
-	/* set eqe_ba [34:3] */
-	roce_set_field(eqc->eqe_ba0,
-		       HNS_ROCE_EQC_EQE_BA_L_M,
-		       HNS_ROCE_EQC_EQE_BA_L_S, eq->eqe_ba >> 3);
+	/* set bt_ba [34:3] */
+	roce_set_field(eqc->eqe_ba0, HNS_ROCE_EQC_EQE_BA_L_M,
+		       HNS_ROCE_EQC_EQE_BA_L_S, bt_ba >> 3);
 
-	/* set eqe_ba [64:35] */
-	roce_set_field(eqc->eqe_ba1,
-		       HNS_ROCE_EQC_EQE_BA_H_M,
-		       HNS_ROCE_EQC_EQE_BA_H_S, eq->eqe_ba >> 35);
+	/* set bt_ba [64:35] */
+	roce_set_field(eqc->eqe_ba1, HNS_ROCE_EQC_EQE_BA_H_M,
+		       HNS_ROCE_EQC_EQE_BA_H_S, bt_ba >> 35);
 
 	/* set eq shift */
-	roce_set_field(eqc->byte_28,
-		       HNS_ROCE_EQC_SHIFT_M,
-		       HNS_ROCE_EQC_SHIFT_S, eq->shift);
+	roce_set_field(eqc->byte_28, HNS_ROCE_EQC_SHIFT_M, HNS_ROCE_EQC_SHIFT_S,
+		       eq->shift);
 
 	/* set eq MSI_IDX */
-	roce_set_field(eqc->byte_28,
-		       HNS_ROCE_EQC_MSI_INDX_M,
-		       HNS_ROCE_EQC_MSI_INDX_S,
-		       HNS_ROCE_EQ_INIT_MSI_IDX);
+	roce_set_field(eqc->byte_28, HNS_ROCE_EQC_MSI_INDX_M,
+		       HNS_ROCE_EQC_MSI_INDX_S, HNS_ROCE_EQ_INIT_MSI_IDX);
 
 	/* set cur_eqe_ba [27:12] */
-	roce_set_field(eqc->byte_28,
-		       HNS_ROCE_EQC_CUR_EQE_BA_L_M,
-		       HNS_ROCE_EQC_CUR_EQE_BA_L_S, eq->cur_eqe_ba >> 12);
+	roce_set_field(eqc->byte_28, HNS_ROCE_EQC_CUR_EQE_BA_L_M,
+		       HNS_ROCE_EQC_CUR_EQE_BA_L_S, eqe_ba[0] >> 12);
 
 	/* set cur_eqe_ba [59:28] */
-	roce_set_field(eqc->byte_32,
-		       HNS_ROCE_EQC_CUR_EQE_BA_M_M,
-		       HNS_ROCE_EQC_CUR_EQE_BA_M_S, eq->cur_eqe_ba >> 28);
+	roce_set_field(eqc->byte_32, HNS_ROCE_EQC_CUR_EQE_BA_M_M,
+		       HNS_ROCE_EQC_CUR_EQE_BA_M_S, eqe_ba[0] >> 28);
 
 	/* set cur_eqe_ba [63:60] */
-	roce_set_field(eqc->byte_36,
-		       HNS_ROCE_EQC_CUR_EQE_BA_H_M,
-		       HNS_ROCE_EQC_CUR_EQE_BA_H_S, eq->cur_eqe_ba >> 60);
+	roce_set_field(eqc->byte_36, HNS_ROCE_EQC_CUR_EQE_BA_H_M,
+		       HNS_ROCE_EQC_CUR_EQE_BA_H_S, eqe_ba[0] >> 60);
 
 	/* set eq consumer idx */
-	roce_set_field(eqc->byte_36,
-		       HNS_ROCE_EQC_CONS_INDX_M,
-		       HNS_ROCE_EQC_CONS_INDX_S,
-		       HNS_ROCE_EQ_INIT_CONS_IDX);
+	roce_set_field(eqc->byte_36, HNS_ROCE_EQC_CONS_INDX_M,
+		       HNS_ROCE_EQC_CONS_INDX_S, HNS_ROCE_EQ_INIT_CONS_IDX);
 
-	/* set nex_eqe_ba[43:12] */
-	roce_set_field(eqc->nxt_eqe_ba0,
-		       HNS_ROCE_EQC_NXT_EQE_BA_L_M,
-		       HNS_ROCE_EQC_NXT_EQE_BA_L_S, eq->nxt_eqe_ba >> 12);
+	roce_set_field(eqc->byte_40, HNS_ROCE_EQC_NXT_EQE_BA_L_M,
+		       HNS_ROCE_EQC_NXT_EQE_BA_L_S, eqe_ba[1] >> 12);
 
-	/* set nex_eqe_ba[63:44] */
-	roce_set_field(eqc->nxt_eqe_ba1,
-		       HNS_ROCE_EQC_NXT_EQE_BA_H_M,
-		       HNS_ROCE_EQC_NXT_EQE_BA_H_S, eq->nxt_eqe_ba >> 44);
-}
+	roce_set_field(eqc->byte_44, HNS_ROCE_EQC_NXT_EQE_BA_H_M,
+		       HNS_ROCE_EQC_NXT_EQE_BA_H_S, eqe_ba[1] >> 44);
 
-static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev,
-				  struct hns_roce_eq *eq)
-{
-	struct device *dev = hr_dev->dev;
-	int eq_alloc_done = 0;
-	int eq_buf_cnt = 0;
-	int eqe_alloc;
-	u32 buf_chk_sz;
-	u32 bt_chk_sz;
-	u32 mhop_num;
-	u64 size;
-	u64 idx;
-	int ba_num;
-	int bt_num;
-	int record_i;
-	int record_j;
-	int i = 0;
-	int j = 0;
-
-	mhop_num = hr_dev->caps.eqe_hop_num;
-	buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT);
-	bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT);
-
-	ba_num = DIV_ROUND_UP(PAGE_ALIGN(eq->entries * eq->eqe_size),
-			      buf_chk_sz);
-	bt_num = DIV_ROUND_UP(ba_num, bt_chk_sz / BA_BYTE_LEN);
-
-	if (mhop_num == HNS_ROCE_HOP_NUM_0) {
-		if (eq->entries > buf_chk_sz / eq->eqe_size) {
-			dev_err(dev, "eq entries %d is larger than buf_pg_sz!",
-				eq->entries);
-			return -EINVAL;
-		}
-		eq->bt_l0 = dma_alloc_coherent(dev, eq->entries * eq->eqe_size,
-					       &(eq->l0_dma), GFP_KERNEL);
-		if (!eq->bt_l0)
-			return -ENOMEM;
-
-		eq->cur_eqe_ba = eq->l0_dma;
-		eq->nxt_eqe_ba = 0;
-
-		return 0;
-	}
-
-	eq->buf_dma = kcalloc(ba_num, sizeof(*eq->buf_dma), GFP_KERNEL);
-	if (!eq->buf_dma)
-		return -ENOMEM;
-	eq->buf = kcalloc(ba_num, sizeof(*eq->buf), GFP_KERNEL);
-	if (!eq->buf)
-		goto err_kcalloc_buf;
-
-	if (mhop_num == 2) {
-		eq->l1_dma = kcalloc(bt_num, sizeof(*eq->l1_dma), GFP_KERNEL);
-		if (!eq->l1_dma)
-			goto err_kcalloc_l1_dma;
-
-		eq->bt_l1 = kcalloc(bt_num, sizeof(*eq->bt_l1), GFP_KERNEL);
-		if (!eq->bt_l1)
-			goto err_kcalloc_bt_l1;
-	}
-
-	/* alloc L0 BT */
-	eq->bt_l0 = dma_alloc_coherent(dev, bt_chk_sz, &eq->l0_dma, GFP_KERNEL);
-	if (!eq->bt_l0)
-		goto err_dma_alloc_l0;
-
-	if (mhop_num == 1) {
-		if (ba_num > (bt_chk_sz / BA_BYTE_LEN))
-			dev_err(dev, "ba_num %d is too large for 1 hop\n",
-				ba_num);
-
-		/* alloc buf */
-		for (i = 0; i < bt_chk_sz / BA_BYTE_LEN; i++) {
-			if (eq_buf_cnt + 1 < ba_num) {
-				size = buf_chk_sz;
-			} else {
-				eqe_alloc = i * (buf_chk_sz / eq->eqe_size);
-				size = (eq->entries - eqe_alloc) * eq->eqe_size;
-			}
-			eq->buf[i] = dma_alloc_coherent(dev, size,
-							&(eq->buf_dma[i]),
-							GFP_KERNEL);
-			if (!eq->buf[i])
-				goto err_dma_alloc_buf;
-
-			*(eq->bt_l0 + i) = eq->buf_dma[i];
-
-			eq_buf_cnt++;
-			if (eq_buf_cnt >= ba_num)
-				break;
-		}
-		eq->cur_eqe_ba = eq->buf_dma[0];
-		if (ba_num > 1)
-			eq->nxt_eqe_ba = eq->buf_dma[1];
-
-	} else if (mhop_num == 2) {
-		/* alloc L1 BT and buf */
-		for (i = 0; i < bt_chk_sz / BA_BYTE_LEN; i++) {
-			eq->bt_l1[i] = dma_alloc_coherent(dev, bt_chk_sz,
-							  &(eq->l1_dma[i]),
-							  GFP_KERNEL);
-			if (!eq->bt_l1[i])
-				goto err_dma_alloc_l1;
-			*(eq->bt_l0 + i) = eq->l1_dma[i];
-
-			for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
-				idx = i * bt_chk_sz / BA_BYTE_LEN + j;
-				if (eq_buf_cnt + 1 < ba_num) {
-					size = buf_chk_sz;
-				} else {
-					eqe_alloc = (buf_chk_sz / eq->eqe_size)
-						    * idx;
-					size = (eq->entries - eqe_alloc)
-						* eq->eqe_size;
-				}
-				eq->buf[idx] = dma_alloc_coherent(dev, size,
-								  &(eq->buf_dma[idx]),
-								  GFP_KERNEL);
-				if (!eq->buf[idx])
-					goto err_dma_alloc_buf;
-
-				*(eq->bt_l1[i] + j) = eq->buf_dma[idx];
-
-				eq_buf_cnt++;
-				if (eq_buf_cnt >= ba_num) {
-					eq_alloc_done = 1;
-					break;
-				}
-			}
-
-			if (eq_alloc_done)
-				break;
-		}
-		eq->cur_eqe_ba = eq->buf_dma[0];
-		if (ba_num > 1)
-			eq->nxt_eqe_ba = eq->buf_dma[1];
-	}
-
-	eq->l0_last_num = i + 1;
-	if (mhop_num == 2)
-		eq->l1_last_num = j + 1;
+	roce_set_field(eqc->byte_44, HNS_ROCE_EQC_EQE_SIZE_M,
+		       HNS_ROCE_EQC_EQE_SIZE_S,
+		       eq->eqe_size == HNS_ROCE_V3_EQE_SIZE ? 1 : 0);
 
 	return 0;
+}
 
-err_dma_alloc_l1:
-	dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma);
-	eq->bt_l0 = NULL;
-	eq->l0_dma = 0;
-	for (i -= 1; i >= 0; i--) {
-		dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
-				  eq->l1_dma[i]);
+static int alloc_eq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq)
+{
+	struct hns_roce_buf_attr buf_attr = {};
+	int err;
 
-		for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
-			idx = i * bt_chk_sz / BA_BYTE_LEN + j;
-			dma_free_coherent(dev, buf_chk_sz, eq->buf[idx],
-					  eq->buf_dma[idx]);
-		}
-	}
-	goto err_dma_alloc_l0;
+	if (hr_dev->caps.eqe_hop_num == HNS_ROCE_HOP_NUM_0)
+		eq->hop_num = 0;
+	else
+		eq->hop_num = hr_dev->caps.eqe_hop_num;
 
-err_dma_alloc_buf:
-	dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma);
-	eq->bt_l0 = NULL;
-	eq->l0_dma = 0;
+	buf_attr.page_shift = hr_dev->caps.eqe_buf_pg_sz + HNS_HW_PAGE_SHIFT;
+	buf_attr.region[0].size = eq->entries * eq->eqe_size;
+	buf_attr.region[0].hopnum = eq->hop_num;
+	buf_attr.region_count = 1;
+	buf_attr.fixed_page = true;
 
-	if (mhop_num == 1)
-		for (i -= 1; i >= 0; i--)
-			dma_free_coherent(dev, buf_chk_sz, eq->buf[i],
-					  eq->buf_dma[i]);
-	else if (mhop_num == 2) {
-		record_i = i;
-		record_j = j;
-		for (; i >= 0; i--) {
-			dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i],
-					  eq->l1_dma[i]);
+	err = hns_roce_mtr_create(hr_dev, &eq->mtr, &buf_attr,
+				  hr_dev->caps.eqe_ba_pg_sz +
+				  HNS_HW_PAGE_SHIFT, NULL, 0);
+	if (err)
+		dev_err(hr_dev->dev, "Failed to alloc EQE mtr, err %d\n", err);
 
-			for (j = 0; j < bt_chk_sz / BA_BYTE_LEN; j++) {
-				if (i == record_i && j >= record_j)
-					break;
-
-				idx = i * bt_chk_sz / BA_BYTE_LEN + j;
-				dma_free_coherent(dev, buf_chk_sz,
-						  eq->buf[idx],
-						  eq->buf_dma[idx]);
-			}
-		}
-	}
-
-err_dma_alloc_l0:
-	kfree(eq->bt_l1);
-	eq->bt_l1 = NULL;
-
-err_kcalloc_bt_l1:
-	kfree(eq->l1_dma);
-	eq->l1_dma = NULL;
-
-err_kcalloc_l1_dma:
-	kfree(eq->buf);
-	eq->buf = NULL;
-
-err_kcalloc_buf:
-	kfree(eq->buf_dma);
-	eq->buf_dma = NULL;
-
-	return -ENOMEM;
+	return err;
 }
 
 static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev,
 				 struct hns_roce_eq *eq,
 				 unsigned int eq_cmd)
 {
-	struct device *dev = hr_dev->dev;
 	struct hns_roce_cmd_mailbox *mailbox;
-	u32 buf_chk_sz = 0;
 	int ret;
 
 	/* Allocate mailbox memory */
 	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
-	if (IS_ERR(mailbox))
-		return PTR_ERR(mailbox);
+	if (IS_ERR_OR_NULL(mailbox))
+		return -ENOMEM;
 
-	if (!hr_dev->caps.eqe_hop_num) {
-		buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT);
+	ret = alloc_eq_buf(hr_dev, eq);
+	if (ret)
+		goto free_cmd_mbox;
 
-		eq->buf_list = kzalloc(sizeof(struct hns_roce_buf_list),
-				       GFP_KERNEL);
-		if (!eq->buf_list) {
-			ret = -ENOMEM;
-			goto free_cmd_mbox;
-		}
-
-		eq->buf_list->buf = dma_alloc_coherent(dev, buf_chk_sz,
-						       &(eq->buf_list->map),
-						       GFP_KERNEL);
-		if (!eq->buf_list->buf) {
-			ret = -ENOMEM;
-			goto err_alloc_buf;
-		}
-
-	} else {
-		ret = hns_roce_mhop_alloc_eq(hr_dev, eq);
-		if (ret) {
-			ret = -ENOMEM;
-			goto free_cmd_mbox;
-		}
-	}
-
-	hns_roce_config_eqc(hr_dev, eq, mailbox->buf);
+	ret = config_eqc(hr_dev, eq, mailbox->buf);
+	if (ret)
+		goto err_cmd_mbox;
 
 	ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, eq->eqn, 0,
 				eq_cmd, HNS_ROCE_CMD_TIMEOUT_MSECS);
 	if (ret) {
-		dev_err(dev, "[mailbox cmd] create eqc failed.\n");
+		dev_err(hr_dev->dev, "[mailbox cmd] create eqc failed.\n");
 		goto err_cmd_mbox;
 	}
 
@@ -5797,16 +5916,7 @@
 	return 0;
 
 err_cmd_mbox:
-	if (!hr_dev->caps.eqe_hop_num)
-		dma_free_coherent(dev, buf_chk_sz, eq->buf_list->buf,
-				  eq->buf_list->map);
-	else {
-		hns_roce_mhop_free_eq(hr_dev, eq);
-		goto free_cmd_mbox;
-	}
-
-err_alloc_buf:
-	kfree(eq->buf_list);
+	free_eq_buf(hr_dev, eq);
 
 free_cmd_mbox:
 	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
@@ -5832,18 +5942,16 @@
 
 	/* irq contains: abnormal + AEQ + CEQ */
 	for (j = 0; j < other_num; j++)
-		snprintf((char *)hr_dev->irq_names[j],
-			 HNS_ROCE_INT_NAME_LEN, "hns-abn-%d", j);
+		snprintf((char *)hr_dev->irq_names[j], HNS_ROCE_INT_NAME_LEN,
+			 "hns-abn-%d", j);
 
 	for (j = other_num; j < (other_num + aeq_num); j++)
-		snprintf((char *)hr_dev->irq_names[j],
-			 HNS_ROCE_INT_NAME_LEN, "hns-aeq-%d",
-			 j - other_num);
+		snprintf((char *)hr_dev->irq_names[j], HNS_ROCE_INT_NAME_LEN,
+			 "hns-aeq-%d", j - other_num);
 
 	for (j = (other_num + aeq_num); j < irq_num; j++)
-		snprintf((char *)hr_dev->irq_names[j],
-			 HNS_ROCE_INT_NAME_LEN, "hns-ceq-%d",
-			 j - other_num - aeq_num);
+		snprintf((char *)hr_dev->irq_names[j], HNS_ROCE_INT_NAME_LEN,
+			 "hns-ceq-%d", j - other_num - aeq_num);
 
 	for (j = 0; j < irq_num; j++) {
 		if (j < other_num)
@@ -5938,7 +6046,7 @@
 			eq_cmd = HNS_ROCE_CMD_CREATE_CEQC;
 			eq->type_flag = HNS_ROCE_CEQ;
 			eq->entries = hr_dev->caps.ceqe_depth;
-			eq->eqe_size = HNS_ROCE_CEQ_ENTRY_SIZE;
+			eq->eqe_size = hr_dev->caps.ceqe_size;
 			eq->irq = hr_dev->irq[i + other_num + aeq_num];
 			eq->eq_max_cnt = HNS_ROCE_CEQ_DEFAULT_BURST_NUM;
 			eq->eq_period = HNS_ROCE_CEQ_DEFAULT_INTERVAL;
@@ -5947,7 +6055,7 @@
 			eq_cmd = HNS_ROCE_CMD_CREATE_AEQC;
 			eq->type_flag = HNS_ROCE_AEQ;
 			eq->entries = hr_dev->caps.aeqe_depth;
-			eq->eqe_size = HNS_ROCE_AEQ_ENTRY_SIZE;
+			eq->eqe_size = hr_dev->caps.aeqe_size;
 			eq->irq = hr_dev->irq[i - comp_num + other_num];
 			eq->eq_max_cnt = HNS_ROCE_AEQ_DEFAULT_BURST_NUM;
 			eq->eq_period = HNS_ROCE_AEQ_DEFAULT_INTERVAL;
@@ -5970,8 +6078,7 @@
 		goto err_request_irq_fail;
 	}
 
-	hr_dev->irq_workq =
-		create_singlethread_workqueue("hns_roce_irq_workqueue");
+	hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0);
 	if (!hr_dev->irq_workq) {
 		dev_err(dev, "Create irq workqueue failed!\n");
 		ret = -ENOMEM;
@@ -5988,7 +6095,7 @@
 
 err_create_eq_fail:
 	for (i -= 1; i >= 0; i--)
-		hns_roce_v2_free_eq(hr_dev, &eq_table->eq[i]);
+		free_eq_buf(hr_dev, &eq_table->eq[i]);
 	kfree(eq_table->eq);
 
 	return ret;
@@ -6010,7 +6117,7 @@
 	for (i = 0; i < eq_num; i++) {
 		hns_roce_v2_destroy_eqc(hr_dev, i);
 
-		hns_roce_v2_free_eq(hr_dev, &eq_table->eq[i]);
+		free_eq_buf(hr_dev, &eq_table->eq[i]);
 	}
 
 	kfree(eq_table->eq);
@@ -6019,291 +6126,6 @@
 	destroy_workqueue(hr_dev->irq_workq);
 }
 
-static void hns_roce_v2_write_srqc(struct hns_roce_dev *hr_dev,
-				   struct hns_roce_srq *srq, u32 pdn, u16 xrcd,
-				   u32 cqn, void *mb_buf, u64 *mtts_wqe,
-				   u64 *mtts_idx, dma_addr_t dma_handle_wqe,
-				   dma_addr_t dma_handle_idx)
-{
-	struct hns_roce_srq_context *srq_context;
-
-	srq_context = mb_buf;
-	memset(srq_context, 0, sizeof(*srq_context));
-
-	roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQ_ST_M,
-		       SRQC_BYTE_4_SRQ_ST_S, 1);
-
-	roce_set_field(srq_context->byte_4_srqn_srqst,
-		       SRQC_BYTE_4_SRQ_WQE_HOP_NUM_M,
-		       SRQC_BYTE_4_SRQ_WQE_HOP_NUM_S,
-		       (hr_dev->caps.srqwqe_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 :
-		       hr_dev->caps.srqwqe_hop_num));
-	roce_set_field(srq_context->byte_4_srqn_srqst,
-		       SRQC_BYTE_4_SRQ_SHIFT_M, SRQC_BYTE_4_SRQ_SHIFT_S,
-		       ilog2(srq->max));
-
-	roce_set_field(srq_context->byte_4_srqn_srqst, SRQC_BYTE_4_SRQN_M,
-		       SRQC_BYTE_4_SRQN_S, srq->srqn);
-
-	roce_set_field(srq_context->byte_8_limit_wl, SRQC_BYTE_8_SRQ_LIMIT_WL_M,
-		       SRQC_BYTE_8_SRQ_LIMIT_WL_S, 0);
-
-	roce_set_field(srq_context->byte_12_xrcd, SRQC_BYTE_12_SRQ_XRCD_M,
-		       SRQC_BYTE_12_SRQ_XRCD_S, xrcd);
-
-	srq_context->wqe_bt_ba = cpu_to_le32((u32)(dma_handle_wqe >> 3));
-
-	roce_set_field(srq_context->byte_24_wqe_bt_ba,
-		       SRQC_BYTE_24_SRQ_WQE_BT_BA_M,
-		       SRQC_BYTE_24_SRQ_WQE_BT_BA_S,
-		       dma_handle_wqe >> 35);
-
-	roce_set_field(srq_context->byte_28_rqws_pd, SRQC_BYTE_28_PD_M,
-		       SRQC_BYTE_28_PD_S, pdn);
-	roce_set_field(srq_context->byte_28_rqws_pd, SRQC_BYTE_28_RQWS_M,
-		       SRQC_BYTE_28_RQWS_S, srq->max_gs <= 0 ? 0 :
-		       fls(srq->max_gs - 1));
-
-	srq_context->idx_bt_ba = cpu_to_le32(dma_handle_idx >> 3);
-	roce_set_field(srq_context->rsv_idx_bt_ba,
-		       SRQC_BYTE_36_SRQ_IDX_BT_BA_M,
-		       SRQC_BYTE_36_SRQ_IDX_BT_BA_S,
-		       dma_handle_idx >> 35);
-
-	srq_context->idx_cur_blk_addr =
-		cpu_to_le32(mtts_idx[0] >> PAGE_ADDR_SHIFT);
-	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
-		       SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_M,
-		       SRQC_BYTE_44_SRQ_IDX_CUR_BLK_ADDR_S,
-		       mtts_idx[0] >> (32 + PAGE_ADDR_SHIFT));
-	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
-		       SRQC_BYTE_44_SRQ_IDX_HOP_NUM_M,
-		       SRQC_BYTE_44_SRQ_IDX_HOP_NUM_S,
-		       hr_dev->caps.idx_hop_num == HNS_ROCE_HOP_NUM_0 ? 0 :
-		       hr_dev->caps.idx_hop_num);
-
-	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
-		       SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_M,
-		       SRQC_BYTE_44_SRQ_IDX_BA_PG_SZ_S,
-		       hr_dev->caps.idx_ba_pg_sz + PG_SHIFT_OFFSET);
-	roce_set_field(srq_context->byte_44_idxbufpgsz_addr,
-		       SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_M,
-		       SRQC_BYTE_44_SRQ_IDX_BUF_PG_SZ_S,
-		       hr_dev->caps.idx_buf_pg_sz + PG_SHIFT_OFFSET);
-
-	srq_context->idx_nxt_blk_addr =
-		cpu_to_le32(mtts_idx[1] >> PAGE_ADDR_SHIFT);
-	roce_set_field(srq_context->rsv_idxnxtblkaddr,
-		       SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_M,
-		       SRQC_BYTE_52_SRQ_IDX_NXT_BLK_ADDR_S,
-		       mtts_idx[1] >> (32 + PAGE_ADDR_SHIFT));
-	roce_set_field(srq_context->byte_56_xrc_cqn,
-		       SRQC_BYTE_56_SRQ_XRC_CQN_M, SRQC_BYTE_56_SRQ_XRC_CQN_S,
-		       cqn);
-	roce_set_field(srq_context->byte_56_xrc_cqn,
-		       SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_M,
-		       SRQC_BYTE_56_SRQ_WQE_BA_PG_SZ_S,
-		       hr_dev->caps.srqwqe_ba_pg_sz + PG_SHIFT_OFFSET);
-	roce_set_field(srq_context->byte_56_xrc_cqn,
-		       SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_M,
-		       SRQC_BYTE_56_SRQ_WQE_BUF_PG_SZ_S,
-		       hr_dev->caps.srqwqe_buf_pg_sz + PG_SHIFT_OFFSET);
-
-	roce_set_bit(srq_context->db_record_addr_record_en,
-		     SRQC_BYTE_60_SRQ_RECORD_EN_S, 0);
-}
-
-static int hns_roce_v2_modify_srq(struct ib_srq *ibsrq,
-				  struct ib_srq_attr *srq_attr,
-				  enum ib_srq_attr_mask srq_attr_mask,
-				  struct ib_udata *udata)
-{
-	struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
-	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
-	struct hns_roce_srq_context *srq_context;
-	struct hns_roce_srq_context *srqc_mask;
-	struct hns_roce_cmd_mailbox *mailbox;
-	int ret;
-
-	if (srq_attr_mask & IB_SRQ_LIMIT) {
-		if (srq_attr->srq_limit >= srq->max)
-			return -EINVAL;
-
-		mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
-		if (IS_ERR(mailbox))
-			return PTR_ERR(mailbox);
-
-		srq_context = mailbox->buf;
-		srqc_mask = (struct hns_roce_srq_context *)mailbox->buf + 1;
-
-		memset(srqc_mask, 0xff, sizeof(*srqc_mask));
-
-		roce_set_field(srq_context->byte_8_limit_wl,
-			       SRQC_BYTE_8_SRQ_LIMIT_WL_M,
-			       SRQC_BYTE_8_SRQ_LIMIT_WL_S, srq_attr->srq_limit);
-		roce_set_field(srqc_mask->byte_8_limit_wl,
-			       SRQC_BYTE_8_SRQ_LIMIT_WL_M,
-			       SRQC_BYTE_8_SRQ_LIMIT_WL_S, 0);
-
-		ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, srq->srqn, 0,
-					HNS_ROCE_CMD_MODIFY_SRQC,
-					HNS_ROCE_CMD_TIMEOUT_MSECS);
-		hns_roce_free_cmd_mailbox(hr_dev, mailbox);
-		if (ret) {
-			dev_err(hr_dev->dev,
-				"MODIFY SRQ Failed to cmd mailbox.\n");
-			return ret;
-		}
-	}
-
-	return 0;
-}
-
-static int hns_roce_v2_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
-{
-	struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
-	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
-	struct hns_roce_srq_context *srq_context;
-	struct hns_roce_cmd_mailbox *mailbox;
-	int limit_wl;
-	int ret;
-
-	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
-	if (IS_ERR(mailbox))
-		return PTR_ERR(mailbox);
-
-	srq_context = mailbox->buf;
-	ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, srq->srqn, 0,
-				HNS_ROCE_CMD_QUERY_SRQC,
-				HNS_ROCE_CMD_TIMEOUT_MSECS);
-	if (ret) {
-		dev_err(hr_dev->dev, "QUERY SRQ cmd process error\n");
-		goto out;
-	}
-
-	limit_wl = roce_get_field(srq_context->byte_8_limit_wl,
-				  SRQC_BYTE_8_SRQ_LIMIT_WL_M,
-				  SRQC_BYTE_8_SRQ_LIMIT_WL_S);
-
-	attr->srq_limit = limit_wl;
-	attr->max_wr    = srq->max - 1;
-	attr->max_sge   = srq->max_gs;
-
-	memcpy(srq_context, mailbox->buf, sizeof(*srq_context));
-
-out:
-	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
-	return ret;
-}
-
-static int find_empty_entry(struct hns_roce_idx_que *idx_que,
-			    unsigned long size)
-{
-	int wqe_idx;
-
-	if (unlikely(bitmap_full(idx_que->bitmap, size)))
-		return -ENOSPC;
-
-	wqe_idx = find_first_zero_bit(idx_que->bitmap, size);
-
-	bitmap_set(idx_que->bitmap, wqe_idx, 1);
-
-	return wqe_idx;
-}
-
-static void fill_idx_queue(struct hns_roce_idx_que *idx_que,
-			   int cur_idx, int wqe_idx)
-{
-	unsigned int *addr;
-
-	addr = (unsigned int *)hns_roce_buf_offset(&idx_que->idx_buf,
-						   cur_idx * idx_que->entry_sz);
-	*addr = wqe_idx;
-}
-
-static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
-				     const struct ib_recv_wr *wr,
-				     const struct ib_recv_wr **bad_wr)
-{
-	struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
-	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
-	struct hns_roce_v2_wqe_data_seg *dseg;
-	struct hns_roce_v2_db srq_db;
-	unsigned long flags;
-	int ret = 0;
-	int wqe_idx;
-	void *wqe;
-	int nreq;
-	int ind;
-	int i;
-
-	spin_lock_irqsave(&srq->lock, flags);
-
-	ind = srq->head & (srq->max - 1);
-
-	for (nreq = 0; wr; ++nreq, wr = wr->next) {
-		if (unlikely(wr->num_sge > srq->max_gs)) {
-			ret = -EINVAL;
-			*bad_wr = wr;
-			break;
-		}
-
-		if (unlikely(srq->head == srq->tail)) {
-			ret = -ENOMEM;
-			*bad_wr = wr;
-			break;
-		}
-
-		wqe_idx = find_empty_entry(&srq->idx_que, srq->max);
-		if (wqe_idx < 0) {
-			ret = -ENOMEM;
-			*bad_wr = wr;
-			break;
-		}
-
-		fill_idx_queue(&srq->idx_que, ind, wqe_idx);
-		wqe = get_srq_wqe(srq, wqe_idx);
-		dseg = (struct hns_roce_v2_wqe_data_seg *)wqe;
-
-		for (i = 0; i < wr->num_sge; ++i) {
-			dseg[i].len = cpu_to_le32(wr->sg_list[i].length);
-			dseg[i].lkey = cpu_to_le32(wr->sg_list[i].lkey);
-			dseg[i].addr = cpu_to_le64(wr->sg_list[i].addr);
-		}
-
-		if (i < srq->max_gs) {
-			dseg[i].len = 0;
-			dseg[i].lkey = cpu_to_le32(0x100);
-			dseg[i].addr = 0;
-		}
-
-		srq->wrid[wqe_idx] = wr->wr_id;
-		ind = (ind + 1) & (srq->max - 1);
-	}
-
-	if (likely(nreq)) {
-		srq->head += nreq;
-
-		/*
-		 * Make sure that descriptors are written before
-		 * doorbell record.
-		 */
-		wmb();
-
-		srq_db.byte_4 =
-			cpu_to_le32(HNS_ROCE_V2_SRQ_DB << V2_DB_BYTE_4_CMD_S |
-				    (srq->srqn & V2_DB_BYTE_4_TAG_M));
-		srq_db.parameter = cpu_to_le32(srq->head);
-
-		hns_roce_write64(hr_dev, (__le32 *)&srq_db, srq->db_reg_l);
-
-	}
-
-	spin_unlock_irqrestore(&srq->lock, flags);
-
-	return ret;
-}
-
 static const struct hns_roce_dfx_hw hns_roce_dfx_hw_v2 = {
 	.query_cqc_info = hns_roce_v2_query_cqc_info,
 };
@@ -6373,12 +6195,14 @@
 
 MODULE_DEVICE_TABLE(pci, hns_roce_hw_v2_pci_tbl);
 
-static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
+static void hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
 				  struct hnae3_handle *handle)
 {
 	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	int i;
 
+	hr_dev->pci_dev = handle->pdev;
+	hr_dev->dev = &handle->pdev->dev;
 	hr_dev->hw = &hns_roce_hw_v2;
 	hr_dev->dfx = &hns_roce_dfx_hw_v2;
 	hr_dev->sdb_offset = ROCEE_DB_SQ_L_0_REG;
@@ -6403,8 +6227,6 @@
 
 	hr_dev->reset_cnt = handle->ae_algo->ops->ae_dev_reset_cnt(handle);
 	priv->handle = handle;
-
-	return 0;
 }
 
 static int __hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
@@ -6422,14 +6244,7 @@
 		goto error_failed_kzalloc;
 	}
 
-	hr_dev->pci_dev = handle->pdev;
-	hr_dev->dev = &handle->pdev->dev;
-
-	ret = hns_roce_hw_v2_get_cfg(hr_dev, handle);
-	if (ret) {
-		dev_err(hr_dev->dev, "Get Configuration failed!\n");
-		goto error_failed_get_cfg;
-	}
+	hns_roce_hw_v2_get_cfg(hr_dev, handle);
 
 	ret = hns_roce_init(hr_dev);
 	if (ret) {
@@ -6453,12 +6268,16 @@
 static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
 					   bool reset)
 {
-	struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv;
+	struct hns_roce_dev *hr_dev = handle->priv;
 
 	if (!hr_dev)
 		return;
 
 	handle->priv = NULL;
+
+	hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT;
+	hns_roce_handle_device_err(hr_dev);
+
 	hns_roce_exit(hr_dev);
 	kfree(hr_dev->priv);
 	ib_dealloc_device(&hr_dev->ib_dev);
@@ -6520,7 +6339,6 @@
 static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
 {
 	struct hns_roce_dev *hr_dev;
-	struct ib_event event;
 
 	if (handle->rinfo.instance_state != HNS_ROCE_STATE_INITED) {
 		set_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
@@ -6530,18 +6348,13 @@
 	handle->rinfo.reset_state = HNS_ROCE_STATE_RST_DOWN;
 	clear_bit(HNS_ROCE_RST_DIRECT_RETURN, &handle->rinfo.state);
 
-	hr_dev = (struct hns_roce_dev *)handle->priv;
+	hr_dev = handle->priv;
 	if (!hr_dev)
 		return 0;
 
-	hr_dev->is_reset = true;
 	hr_dev->active = false;
 	hr_dev->dis_db = true;
-
-	event.event = IB_EVENT_DEVICE_FATAL;
-	event.device = &hr_dev->ib_dev;
-	event.element.port_num = 1;
-	ib_dispatch_event(&event);
+	hr_dev->state = HNS_ROCE_DEVICE_STATE_RST_DOWN;
 
 	return 0;
 }
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 76a14db..be7f2fe 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -50,17 +50,17 @@
 #define HNS_ROCE_V2_MAX_WQE_NUM			0x8000
 #define	HNS_ROCE_V2_MAX_SRQ			0x100000
 #define HNS_ROCE_V2_MAX_SRQ_WR			0x8000
-#define HNS_ROCE_V2_MAX_SRQ_SGE			0x100
+#define HNS_ROCE_V2_MAX_SRQ_SGE			64
 #define HNS_ROCE_V2_MAX_CQ_NUM			0x100000
 #define HNS_ROCE_V2_MAX_CQC_TIMER_NUM		0x100
 #define HNS_ROCE_V2_MAX_SRQ_NUM			0x100000
 #define HNS_ROCE_V2_MAX_CQE_NUM			0x400000
 #define HNS_ROCE_V2_MAX_SRQWQE_NUM		0x8000
-#define HNS_ROCE_V2_MAX_RQ_SGE_NUM		0x100
-#define HNS_ROCE_V2_MAX_SQ_SGE_NUM		0xff
-#define HNS_ROCE_V2_MAX_SRQ_SGE_NUM		0x100
+#define HNS_ROCE_V2_MAX_RQ_SGE_NUM		64
+#define HNS_ROCE_V2_MAX_SQ_SGE_NUM		64
 #define HNS_ROCE_V2_MAX_EXTEND_SGE_NUM		0x200000
 #define HNS_ROCE_V2_MAX_SQ_INLINE		0x20
+#define HNS_ROCE_V2_MAX_RC_INL_INN_SZ		32
 #define HNS_ROCE_V2_UAR_NUM			256
 #define HNS_ROCE_V2_PHY_UAR_NUM			1
 #define HNS_ROCE_V2_MAX_IRQ_NUM			65
@@ -78,15 +78,18 @@
 #define HNS_ROCE_V2_MAX_SQ_DESC_SZ		64
 #define HNS_ROCE_V2_MAX_RQ_DESC_SZ		16
 #define HNS_ROCE_V2_MAX_SRQ_DESC_SZ		64
-#define HNS_ROCE_V2_QPC_ENTRY_SZ		256
 #define HNS_ROCE_V2_IRRL_ENTRY_SZ		64
 #define HNS_ROCE_V2_TRRL_ENTRY_SZ		48
+#define HNS_ROCE_V2_EXT_ATOMIC_TRRL_ENTRY_SZ	100
 #define HNS_ROCE_V2_CQC_ENTRY_SZ		64
 #define HNS_ROCE_V2_SRQC_ENTRY_SZ		64
 #define HNS_ROCE_V2_MTPT_ENTRY_SZ		64
 #define HNS_ROCE_V2_MTT_ENTRY_SZ		64
-#define HNS_ROCE_V2_CQE_ENTRY_SIZE		32
-#define HNS_ROCE_V2_SCCC_ENTRY_SZ		32
+#define HNS_ROCE_V2_IDX_ENTRY_SZ		4
+
+#define HNS_ROCE_V2_SCCC_SZ			32
+#define HNS_ROCE_V3_SCCC_SZ			64
+
 #define HNS_ROCE_V2_QPC_TIMER_ENTRY_SZ		PAGE_SIZE
 #define HNS_ROCE_V2_CQC_TIMER_ENTRY_SZ		PAGE_SIZE
 #define HNS_ROCE_V2_PAGE_SIZE_SUPPORTED		0xFFFFF000
@@ -109,7 +112,12 @@
 #define HNS_ROCE_PBL_HOP_NUM			2
 #define HNS_ROCE_EQE_HOP_NUM			2
 #define HNS_ROCE_IDX_HOP_NUM			1
+#define HNS_ROCE_SQWQE_HOP_NUM			2
+#define HNS_ROCE_EXT_SGE_HOP_NUM		1
+#define HNS_ROCE_RQWQE_HOP_NUM			2
 
+#define HNS_ROCE_BA_PG_SZ_SUPPORTED_256K	6
+#define HNS_ROCE_BA_PG_SZ_SUPPORTED_16K		2
 #define HNS_ROCE_V2_GID_INDEX_NUM		256
 
 #define HNS_ROCE_V2_TABLE_CHUNK_SIZE		(1 << 18)
@@ -156,7 +164,7 @@
 
 #define	GID_LEN_V2				16
 
-#define HNS_ROCE_V2_CQE_QPN_MASK		0x3ffff
+#define HNS_ROCE_V2_CQE_QPN_MASK		0xfffff
 
 enum {
 	HNS_ROCE_V2_WQE_OP_SEND				= 0x0,
@@ -171,27 +179,11 @@
 	HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD	= 0x9,
 	HNS_ROCE_V2_WQE_OP_FAST_REG_PMR			= 0xa,
 	HNS_ROCE_V2_WQE_OP_LOCAL_INV			= 0xb,
-	HNS_ROCE_V2_WQE_OP_BIND_MW_TYPE			= 0xc,
+	HNS_ROCE_V2_WQE_OP_BIND_MW			= 0xc,
 	HNS_ROCE_V2_WQE_OP_MASK				= 0x1f,
 };
 
 enum {
-	HNS_ROCE_SQ_OPCODE_SEND = 0x0,
-	HNS_ROCE_SQ_OPCODE_SEND_WITH_INV = 0x1,
-	HNS_ROCE_SQ_OPCODE_SEND_WITH_IMM = 0x2,
-	HNS_ROCE_SQ_OPCODE_RDMA_WRITE = 0x3,
-	HNS_ROCE_SQ_OPCODE_RDMA_WRITE_WITH_IMM = 0x4,
-	HNS_ROCE_SQ_OPCODE_RDMA_READ = 0x5,
-	HNS_ROCE_SQ_OPCODE_ATOMIC_COMP_AND_SWAP = 0x6,
-	HNS_ROCE_SQ_OPCODE_ATOMIC_FETCH_AND_ADD = 0x7,
-	HNS_ROCE_SQ_OPCODE_ATOMIC_MASK_COMP_AND_SWAP = 0x8,
-	HNS_ROCE_SQ_OPCODE_ATOMIC_MASK_FETCH_AND_ADD = 0x9,
-	HNS_ROCE_SQ_OPCODE_FAST_REG_WR = 0xa,
-	HNS_ROCE_SQ_OPCODE_LOCAL_INV = 0xb,
-	HNS_ROCE_SQ_OPCODE_BIND_MW = 0xc,
-};
-
-enum {
 	/* rq operations */
 	HNS_ROCE_V2_OPCODE_RDMA_WRITE_IMM = 0x0,
 	HNS_ROCE_V2_OPCODE_SEND = 0x1,
@@ -222,6 +214,7 @@
 	HNS_ROCE_CQE_V2_TRANSPORT_RETRY_EXC_ERR		= 0x15,
 	HNS_ROCE_CQE_V2_RNR_RETRY_EXC_ERR		= 0x16,
 	HNS_ROCE_CQE_V2_REMOTE_ABORT_ERR		= 0x22,
+	HNS_ROCE_CQE_V2_GENERAL_ERR			= 0x23,
 
 	HNS_ROCE_V2_CQE_STATUS_MASK			= 0xff,
 };
@@ -237,6 +230,8 @@
 	HNS_ROCE_OPC_CFG_EXT_LLM			= 0x8403,
 	HNS_ROCE_OPC_CFG_TMOUT_LLM			= 0x8404,
 	HNS_ROCE_OPC_QUERY_PF_TIMER_RES			= 0x8406,
+	HNS_ROCE_OPC_QUERY_PF_CAPS_NUM                  = 0x8408,
+	HNS_ROCE_OPC_CFG_ENTRY_SIZE			= 0x8409,
 	HNS_ROCE_OPC_CFG_SGID_TB			= 0x8500,
 	HNS_ROCE_OPC_CFG_SMAC_TB			= 0x8501,
 	HNS_ROCE_OPC_POST_MB				= 0x8504,
@@ -317,6 +312,9 @@
 #define	V2_CQC_BYTE_8_CQN_S 0
 #define V2_CQC_BYTE_8_CQN_M GENMASK(23, 0)
 
+#define V2_CQC_BYTE_8_CQE_SIZE_S 27
+#define V2_CQC_BYTE_8_CQE_SIZE_M GENMASK(28, 27)
+
 #define	V2_CQC_BYTE_16_CQE_CUR_BLK_ADDR_S 0
 #define V2_CQC_BYTE_16_CQE_CUR_BLK_ADDR_M GENMASK(19, 0)
 
@@ -452,8 +450,8 @@
 	HNS_ROCE_QP_ST_INIT,
 	HNS_ROCE_QP_ST_RTR,
 	HNS_ROCE_QP_ST_RTS,
-	HNS_ROCE_QP_ST_SQER,
 	HNS_ROCE_QP_ST_SQD,
+	HNS_ROCE_QP_ST_SQER,
 	HNS_ROCE_QP_ST_ERR,
 	HNS_ROCE_QP_ST_SQ_DRAINING,
 	HNS_ROCE_QP_NUM_ST
@@ -520,6 +518,7 @@
 	__le32	byte_248_ack_psn;
 	__le32	byte_252_err_txcqn;
 	__le32	byte_256_sqflush_rqcqe;
+	__le32	ext[64];
 };
 
 #define	V2_QPC_BYTE_4_TST_S 0
@@ -643,7 +642,7 @@
 #define	V2_QPC_BYTE_76_ATE_S 27
 
 #define	V2_QPC_BYTE_76_RQIE_S 28
-
+#define	V2_QPC_BYTE_76_EXT_ATE_S 29
 #define	V2_QPC_BYTE_76_RQ_VLAN_EN_S 30
 #define	V2_QPC_BYTE_80_RX_CQN_S 0
 #define V2_QPC_BYTE_80_RX_CQN_M GENMASK(23, 0)
@@ -904,6 +903,7 @@
 	u8	smac[4];
 	__le32	byte_28;
 	__le32	byte_32;
+	__le32	rsv[8];
 };
 
 #define	V2_CQE_BYTE_4_OPCODE_S 0
@@ -1048,11 +1048,6 @@
 #define V2_DB_PARAMETER_SL_S 16
 #define V2_DB_PARAMETER_SL_M GENMASK(18, 16)
 
-struct hns_roce_v2_cq_db {
-	__le32	byte_4;
-	__le32	parameter;
-};
-
 #define	V2_CQ_DB_BYTE_4_TAG_S 0
 #define V2_CQ_DB_BYTE_4_TAG_M GENMASK(23, 0)
 
@@ -1200,6 +1195,8 @@
 #define V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S 0
 #define V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M GENMASK(23, 0)
 
+#define V2_RC_SEND_WQE_BYTE_20_INL_TYPE_S 31
+
 struct hns_roce_wqe_frmr_seg {
 	__le32	pbl_size;
 	__le32	mode_buf_pg_sz;
@@ -1239,10 +1236,9 @@
 };
 
 #define FUNC_CLEAR_RST_FUN_DONE_S 0
-/* Each physical function manages up to 248 virtual functions；
- * it takes up to 100ms for each function to execute clear；
- * if an abnormal reset occurs, it is executed twice at most;
- * so it takes up to 249 * 2 * 100ms.
+/* Each physical function manages up to 248 virtual functions, it takes up to
+ * 100ms for each function to execute clear. If an abnormal reset occurs, it is
+ * executed twice at most, so it takes up to 249 * 2 * 100ms.
  */
 #define HNS_ROCE_V2_FUNC_CLEAR_TIMEOUT_MSECS	(249 * 2 * 100)
 #define HNS_ROCE_V2_READ_FUNC_CLEAR_FLAG_INTERVAL	40
@@ -1551,6 +1547,18 @@
 	__le32	vf_sgid_h;
 	__le32	vf_sgid_type_rsv;
 };
+
+enum {
+	HNS_ROCE_CFG_QPC_SIZE = BIT(0),
+	HNS_ROCE_CFG_SCCC_SIZE = BIT(1),
+};
+
+struct hns_roce_cfg_entry_size {
+	__le32	type;
+	__le32	rsv[4];
+	__le32	size;
+};
+
 #define CFG_SGID_TB_TABLE_IDX_S 0
 #define CFG_SGID_TB_TABLE_IDX_M GENMASK(7, 0)
 
@@ -1569,6 +1577,155 @@
 #define CFG_SMAC_TB_VF_SMAC_H_S 0
 #define CFG_SMAC_TB_VF_SMAC_H_M GENMASK(15, 0)
 
+#define HNS_ROCE_QUERY_PF_CAPS_CMD_NUM 5
+struct hns_roce_query_pf_caps_a {
+	u8 number_ports;
+	u8 local_ca_ack_delay;
+	__le16 max_sq_sg;
+	__le16 max_sq_inline;
+	__le16 max_rq_sg;
+	__le32 max_extend_sg;
+	__le16 num_qpc_timer;
+	__le16 num_cqc_timer;
+	__le16 max_srq_sges;
+	u8 num_aeq_vectors;
+	u8 num_other_vectors;
+	u8 max_sq_desc_sz;
+	u8 max_rq_desc_sz;
+	u8 max_srq_desc_sz;
+	u8 cqe_sz;
+};
+
+struct hns_roce_query_pf_caps_b {
+	u8 mtpt_entry_sz;
+	u8 irrl_entry_sz;
+	u8 trrl_entry_sz;
+	u8 cqc_entry_sz;
+	u8 srqc_entry_sz;
+	u8 idx_entry_sz;
+	u8 sccc_sz;
+	u8 max_mtu;
+	__le16 qpc_sz;
+	__le16 qpc_timer_entry_sz;
+	__le16 cqc_timer_entry_sz;
+	u8 min_cqes;
+	u8 min_wqes;
+	__le32 page_size_cap;
+	u8 pkey_table_len;
+	u8 phy_num_uars;
+	u8 ctx_hop_num;
+	u8 pbl_hop_num;
+};
+
+struct hns_roce_query_pf_caps_c {
+	__le32 cap_flags_num_pds;
+	__le32 max_gid_num_cqs;
+	__le32 cq_depth;
+	__le32 num_mrws;
+	__le32 ord_num_qps;
+	__le16 sq_depth;
+	__le16 rq_depth;
+};
+
+#define V2_QUERY_PF_CAPS_C_NUM_PDS_S 0
+#define V2_QUERY_PF_CAPS_C_NUM_PDS_M GENMASK(19, 0)
+
+#define V2_QUERY_PF_CAPS_C_CAP_FLAGS_S 20
+#define V2_QUERY_PF_CAPS_C_CAP_FLAGS_M GENMASK(31, 20)
+
+#define V2_QUERY_PF_CAPS_C_NUM_CQS_S 0
+#define V2_QUERY_PF_CAPS_C_NUM_CQS_M GENMASK(19, 0)
+
+#define V2_QUERY_PF_CAPS_C_MAX_GID_S 20
+#define V2_QUERY_PF_CAPS_C_MAX_GID_M GENMASK(28, 20)
+
+#define V2_QUERY_PF_CAPS_C_CQ_DEPTH_S 0
+#define V2_QUERY_PF_CAPS_C_CQ_DEPTH_M GENMASK(22, 0)
+
+#define V2_QUERY_PF_CAPS_C_NUM_MRWS_S 0
+#define V2_QUERY_PF_CAPS_C_NUM_MRWS_M GENMASK(19, 0)
+
+#define V2_QUERY_PF_CAPS_C_NUM_QPS_S 0
+#define V2_QUERY_PF_CAPS_C_NUM_QPS_M GENMASK(19, 0)
+
+#define V2_QUERY_PF_CAPS_C_MAX_ORD_S 20
+#define V2_QUERY_PF_CAPS_C_MAX_ORD_M GENMASK(27, 20)
+
+struct hns_roce_query_pf_caps_d {
+	__le32 wq_hop_num_max_srqs;
+	__le16 srq_depth;
+	__le16 cap_flags_ex;
+	__le32 num_ceqs_ceq_depth;
+	__le32 arm_st_aeq_depth;
+	__le32 num_uars_rsv_pds;
+	__le32 rsv_uars_rsv_qps;
+};
+#define V2_QUERY_PF_CAPS_D_NUM_SRQS_S 0
+#define V2_QUERY_PF_CAPS_D_NUM_SRQS_M GENMASK(19, 0)
+
+#define V2_QUERY_PF_CAPS_D_RQWQE_HOP_NUM_S 20
+#define V2_QUERY_PF_CAPS_D_RQWQE_HOP_NUM_M GENMASK(21, 20)
+
+#define V2_QUERY_PF_CAPS_D_EX_SGE_HOP_NUM_S 22
+#define V2_QUERY_PF_CAPS_D_EX_SGE_HOP_NUM_M GENMASK(23, 22)
+
+#define V2_QUERY_PF_CAPS_D_SQWQE_HOP_NUM_S 24
+#define V2_QUERY_PF_CAPS_D_SQWQE_HOP_NUM_M GENMASK(25, 24)
+
+
+#define V2_QUERY_PF_CAPS_D_CEQ_DEPTH_S 0
+#define V2_QUERY_PF_CAPS_D_CEQ_DEPTH_M GENMASK(21, 0)
+
+#define V2_QUERY_PF_CAPS_D_NUM_CEQS_S 22
+#define V2_QUERY_PF_CAPS_D_NUM_CEQS_M GENMASK(31, 22)
+
+#define V2_QUERY_PF_CAPS_D_AEQ_DEPTH_S 0
+#define V2_QUERY_PF_CAPS_D_AEQ_DEPTH_M GENMASK(21, 0)
+
+#define V2_QUERY_PF_CAPS_D_AEQ_ARM_ST_S 22
+#define V2_QUERY_PF_CAPS_D_AEQ_ARM_ST_M GENMASK(23, 22)
+
+#define V2_QUERY_PF_CAPS_D_CEQ_ARM_ST_S 24
+#define V2_QUERY_PF_CAPS_D_CEQ_ARM_ST_M GENMASK(25, 24)
+
+#define V2_QUERY_PF_CAPS_D_RSV_PDS_S 0
+#define V2_QUERY_PF_CAPS_D_RSV_PDS_M GENMASK(19, 0)
+
+#define V2_QUERY_PF_CAPS_D_NUM_UARS_S 20
+#define V2_QUERY_PF_CAPS_D_NUM_UARS_M GENMASK(27, 20)
+
+#define V2_QUERY_PF_CAPS_D_RSV_QPS_S 0
+#define V2_QUERY_PF_CAPS_D_RSV_QPS_M GENMASK(19, 0)
+
+#define V2_QUERY_PF_CAPS_D_RSV_UARS_S 20
+#define V2_QUERY_PF_CAPS_D_RSV_UARS_M GENMASK(27, 20)
+
+struct hns_roce_query_pf_caps_e {
+	__le32 chunk_size_shift_rsv_mrws;
+	__le32 rsv_cqs;
+	__le32 rsv_srqs;
+	__le32 rsv_lkey;
+	__le16 ceq_max_cnt;
+	__le16 ceq_period;
+	__le16 aeq_max_cnt;
+	__le16 aeq_period;
+};
+
+#define V2_QUERY_PF_CAPS_E_RSV_MRWS_S 0
+#define V2_QUERY_PF_CAPS_E_RSV_MRWS_M GENMASK(19, 0)
+
+#define V2_QUERY_PF_CAPS_E_CHUNK_SIZE_SHIFT_S 20
+#define V2_QUERY_PF_CAPS_E_CHUNK_SIZE_SHIFT_M GENMASK(31, 20)
+
+#define V2_QUERY_PF_CAPS_E_RSV_CQS_S 0
+#define V2_QUERY_PF_CAPS_E_RSV_CQS_M GENMASK(19, 0)
+
+#define V2_QUERY_PF_CAPS_E_RSV_SRQS_S 0
+#define V2_QUERY_PF_CAPS_E_RSV_SRQS_M GENMASK(19, 0)
+
+#define V2_QUERY_PF_CAPS_E_RSV_LKEYS_S 0
+#define V2_QUERY_PF_CAPS_E_RSV_LKEYS_M GENMASK(19, 0)
+
 struct hns_roce_cmq_desc {
 	__le16 opcode;
 	__le16 flag;
@@ -1642,8 +1799,8 @@
 	__le32	byte_28;
 	__le32	byte_32;
 	__le32	byte_36;
-	__le32	nxt_eqe_ba0;
-	__le32	nxt_eqe_ba1;
+	__le32	byte_40;
+	__le32	byte_44;
 	__le32	rsv[5];
 };
 
@@ -1785,6 +1942,9 @@
 #define HNS_ROCE_EQC_NXT_EQE_BA_H_S 0
 #define HNS_ROCE_EQC_NXT_EQE_BA_H_M GENMASK(19, 0)
 
+#define HNS_ROCE_EQC_EQE_SIZE_S 20
+#define HNS_ROCE_EQC_EQE_SIZE_M GENMASK(21, 20)
+
 #define HNS_ROCE_V2_CEQE_COMP_CQN_S 0
 #define HNS_ROCE_V2_CEQE_COMP_CQN_M GENMASK(23, 0)
 
@@ -1806,6 +1966,8 @@
 #define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S 0
 #define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M GENMASK(23, 0)
 
+#define MAX_SERVICE_LEVEL 0x7
+
 struct hns_roce_wqe_atomic_seg {
 	__le64          fetchadd_swap_data;
 	__le64          cmp_data;
@@ -1827,7 +1989,7 @@
 static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2],
 				    void __iomem *dest)
 {
-	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
+	struct hns_roce_v2_priv *priv = hr_dev->priv;
 	struct hnae3_handle *handle = priv->handle;
 	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index f23a341..1e8b3e4 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -90,7 +90,7 @@
 static int hns_roce_del_gid(const struct ib_gid_attr *attr, void **context)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(attr->device);
-	struct ib_gid_attr zattr = { };
+	struct ib_gid_attr zattr = {};
 	u8 port = attr->port_num - 1;
 	int ret;
 
@@ -111,7 +111,7 @@
 
 	netdev = hr_dev->iboe.netdevs[port];
 	if (!netdev) {
-		dev_err(dev, "port(%d) can't find netdev\n", port);
+		dev_err(dev, "Can't find netdev on port(%u)!\n", port);
 		return -ENODEV;
 	}
 
@@ -141,8 +141,8 @@
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 	struct hns_roce_ib_iboe *iboe = NULL;
 	struct hns_roce_dev *hr_dev = NULL;
-	u8 port = 0;
-	int ret = 0;
+	int ret;
+	u8 port;
 
 	hr_dev = container_of(self, struct hns_roce_dev, iboe.nb);
 	iboe = &hr_dev->iboe;
@@ -210,7 +210,7 @@
 	props->max_pkeys = 1;
 	props->local_ca_ack_delay = hr_dev->caps.local_ca_ack_delay;
 	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) {
-		props->max_srq = hr_dev->caps.max_srqs;
+		props->max_srq = hr_dev->caps.num_srqs;
 		props->max_srq_wr = hr_dev->caps.max_srq_wrs;
 		props->max_srq_sge = hr_dev->caps.max_srq_sges;
 	}
@@ -233,7 +233,6 @@
 	enum ib_mtu mtu;
 	u8 port;
 
-	assert(port_num > 0);
 	port = port_num - 1;
 
 	/* props being zeroed by the caller, avoid zeroing it here */
@@ -253,17 +252,18 @@
 	net_dev = hr_dev->iboe.netdevs[port];
 	if (!net_dev) {
 		spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
-		dev_err(dev, "find netdev %d failed!\r\n", port);
+		dev_err(dev, "Find netdev %u failed!\n", port);
 		return -EINVAL;
 	}
 
 	mtu = iboe_get_mtu(net_dev->mtu);
 	props->active_mtu = mtu ? min(props->max_mtu, mtu) : IB_MTU_256;
-	props->state = (netif_running(net_dev) && netif_carrier_ok(net_dev)) ?
-			IB_PORT_ACTIVE : IB_PORT_DOWN;
-	props->phys_state = (props->state == IB_PORT_ACTIVE) ?
-			     IB_PORT_PHYS_STATE_LINK_UP :
-			     IB_PORT_PHYS_STATE_DISABLED;
+	props->state = netif_running(net_dev) && netif_carrier_ok(net_dev) ?
+			       IB_PORT_ACTIVE :
+			       IB_PORT_DOWN;
+	props->phys_state = props->state == IB_PORT_ACTIVE ?
+				    IB_PORT_PHYS_STATE_LINK_UP :
+				    IB_PORT_PHYS_STATE_DISABLED;
 
 	spin_unlock_irqrestore(&hr_dev->iboe.lock, flags);
 
@@ -279,6 +279,9 @@
 static int hns_roce_query_pkey(struct ib_device *ib_dev, u8 port, u16 index,
 			       u16 *pkey)
 {
+	if (index > 0)
+		return -EINVAL;
+
 	*pkey = PKEY_ID;
 
 	return 0;
@@ -301,12 +304,6 @@
 	return 0;
 }
 
-static int hns_roce_modify_port(struct ib_device *ib_dev, u8 port_num, int mask,
-				struct ib_port_modify *props)
-{
-	return 0;
-}
-
 static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
 				   struct ib_udata *udata)
 {
@@ -329,7 +326,10 @@
 		mutex_init(&context->page_mutex);
 	}
 
-	ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
+	resp.cqe_size = hr_dev->caps.cqe_sz;
+
+	ret = ib_copy_to_udata(udata, &resp,
+			       min(udata->outlen, sizeof(resp)));
 	if (ret)
 		goto error_fail_copy_to_udata;
 
@@ -359,7 +359,8 @@
 		return rdma_user_mmap_io(context, vma,
 					 to_hr_ucontext(context)->uar.pfn,
 					 PAGE_SIZE,
-					 pgprot_noncached(vma->vm_page_prot));
+					 pgprot_device(vma->vm_page_prot),
+					 NULL);
 
 	/* vm_pgoff: 1 -- TPTR */
 	case 1:
@@ -372,7 +373,8 @@
 		return rdma_user_mmap_io(context, vma,
 					 hr_dev->tptr_dma_addr >> PAGE_SHIFT,
 					 hr_dev->tptr_size,
-					 vma->vm_page_prot);
+					 vma->vm_page_prot,
+					 NULL);
 
 	default:
 		return -EINVAL;
@@ -423,22 +425,21 @@
 	.alloc_pd = hns_roce_alloc_pd,
 	.alloc_ucontext = hns_roce_alloc_ucontext,
 	.create_ah = hns_roce_create_ah,
-	.create_cq = hns_roce_ib_create_cq,
+	.create_cq = hns_roce_create_cq,
 	.create_qp = hns_roce_create_qp,
 	.dealloc_pd = hns_roce_dealloc_pd,
 	.dealloc_ucontext = hns_roce_dealloc_ucontext,
 	.del_gid = hns_roce_del_gid,
 	.dereg_mr = hns_roce_dereg_mr,
 	.destroy_ah = hns_roce_destroy_ah,
-	.destroy_cq = hns_roce_ib_destroy_cq,
+	.destroy_cq = hns_roce_destroy_cq,
 	.disassociate_ucontext = hns_roce_disassociate_ucontext,
-	.fill_res_entry = hns_roce_fill_res_entry,
+	.fill_res_cq_entry = hns_roce_fill_res_cq_entry,
 	.get_dma_mr = hns_roce_get_dma_mr,
 	.get_link_layer = hns_roce_get_link_layer,
 	.get_port_immutable = hns_roce_port_immutable,
 	.mmap = hns_roce_mmap,
 	.modify_device = hns_roce_modify_device,
-	.modify_port = hns_roce_modify_port,
 	.modify_qp = hns_roce_modify_qp,
 	.query_ah = hns_roce_query_ah,
 	.query_device = hns_roce_query_device,
@@ -459,6 +460,8 @@
 static const struct ib_device_ops hns_roce_dev_mw_ops = {
 	.alloc_mw = hns_roce_alloc_mw,
 	.dealloc_mw = hns_roce_dealloc_mw,
+
+	INIT_RDMA_OBJ_SIZE(ib_mw, hns_roce_mw, ibmw),
 };
 
 static const struct ib_device_ops hns_roce_dev_frmr_ops = {
@@ -486,13 +489,13 @@
 
 	ib_dev = &hr_dev->ib_dev;
 
-	ib_dev->node_type		= RDMA_NODE_IB_CA;
-	ib_dev->dev.parent		= dev;
+	ib_dev->node_type = RDMA_NODE_IB_CA;
+	ib_dev->dev.parent = dev;
 
-	ib_dev->phys_port_cnt		= hr_dev->caps.num_ports;
-	ib_dev->local_dma_lkey		= hr_dev->caps.reserved_lkey;
-	ib_dev->num_comp_vectors	= hr_dev->caps.num_comp_vectors;
-	ib_dev->uverbs_cmd_mask		=
+	ib_dev->phys_port_cnt = hr_dev->caps.num_ports;
+	ib_dev->local_dma_lkey = hr_dev->caps.reserved_lkey;
+	ib_dev->num_comp_vectors = hr_dev->caps.num_comp_vectors;
+	ib_dev->uverbs_cmd_mask =
 		(1ULL << IB_USER_VERBS_CMD_GET_CONTEXT) |
 		(1ULL << IB_USER_VERBS_CMD_QUERY_DEVICE) |
 		(1ULL << IB_USER_VERBS_CMD_QUERY_PORT) |
@@ -508,8 +511,7 @@
 		(1ULL << IB_USER_VERBS_CMD_QUERY_QP) |
 		(1ULL << IB_USER_VERBS_CMD_DESTROY_QP);
 
-	ib_dev->uverbs_ex_cmd_mask |=
-		(1ULL << IB_USER_VERBS_EX_CMD_MODIFY_CQ);
+	ib_dev->uverbs_ex_cmd_mask |= (1ULL << IB_USER_VERBS_EX_CMD_MODIFY_CQ);
 
 	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_REREG_MR) {
 		ib_dev->uverbs_cmd_mask |= (1ULL << IB_USER_VERBS_CMD_REREG_MR);
@@ -551,7 +553,8 @@
 		if (ret)
 			return ret;
 	}
-	ret = ib_register_device(ib_dev, "hns_%d");
+	dma_set_max_seg_size(dev, UINT_MAX);
+	ret = ib_register_device(ib_dev, "hns_%d", dev);
 	if (ret) {
 		dev_err(dev, "ib_register_device failed!\n");
 		return ret;
@@ -584,35 +587,16 @@
 	int ret;
 	struct device *dev = hr_dev->dev;
 
-	ret = hns_roce_init_hem_table(hr_dev, &hr_dev->mr_table.mtt_table,
-				      HEM_TYPE_MTT, hr_dev->caps.mtt_entry_sz,
-				      hr_dev->caps.num_mtt_segs, 1);
-	if (ret) {
-		dev_err(dev, "Failed to init MTT context memory, aborting.\n");
-		return ret;
-	}
-
-	if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) {
-		ret = hns_roce_init_hem_table(hr_dev,
-				      &hr_dev->mr_table.mtt_cqe_table,
-				      HEM_TYPE_CQE, hr_dev->caps.mtt_entry_sz,
-				      hr_dev->caps.num_cqe_segs, 1);
-		if (ret) {
-			dev_err(dev, "Failed to init MTT CQE context memory, aborting.\n");
-			goto err_unmap_cqe;
-		}
-	}
-
 	ret = hns_roce_init_hem_table(hr_dev, &hr_dev->mr_table.mtpt_table,
 				      HEM_TYPE_MTPT, hr_dev->caps.mtpt_entry_sz,
 				      hr_dev->caps.num_mtpts, 1);
 	if (ret) {
 		dev_err(dev, "Failed to init MTPT context memory, aborting.\n");
-		goto err_unmap_mtt;
+		return ret;
 	}
 
 	ret = hns_roce_init_hem_table(hr_dev, &hr_dev->qp_table.qp_table,
-				      HEM_TYPE_QPC, hr_dev->caps.qpc_entry_sz,
+				      HEM_TYPE_QPC, hr_dev->caps.qpc_sz,
 				      hr_dev->caps.num_qps, 1);
 	if (ret) {
 		dev_err(dev, "Failed to init QP context memory, aborting.\n");
@@ -638,7 +622,7 @@
 					      hr_dev->caps.num_qps, 1);
 		if (ret) {
 			dev_err(dev,
-			       "Failed to init trrl_table memory, aborting.\n");
+				"Failed to init trrl_table memory, aborting.\n");
 			goto err_unmap_irrl;
 		}
 	}
@@ -651,79 +635,51 @@
 		goto err_unmap_trrl;
 	}
 
-	if (hr_dev->caps.srqc_entry_sz) {
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ) {
 		ret = hns_roce_init_hem_table(hr_dev, &hr_dev->srq_table.table,
 					      HEM_TYPE_SRQC,
 					      hr_dev->caps.srqc_entry_sz,
 					      hr_dev->caps.num_srqs, 1);
 		if (ret) {
 			dev_err(dev,
-			      "Failed to init SRQ context memory, aborting.\n");
+				"Failed to init SRQ context memory, aborting.\n");
 			goto err_unmap_cq;
 		}
 	}
 
-	if (hr_dev->caps.num_srqwqe_segs) {
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL) {
 		ret = hns_roce_init_hem_table(hr_dev,
-					     &hr_dev->mr_table.mtt_srqwqe_table,
-					     HEM_TYPE_SRQWQE,
-					     hr_dev->caps.mtt_entry_sz,
-					     hr_dev->caps.num_srqwqe_segs, 1);
+					      &hr_dev->qp_table.sccc_table,
+					      HEM_TYPE_SCCC,
+					      hr_dev->caps.sccc_sz,
+					      hr_dev->caps.num_qps, 1);
 		if (ret) {
 			dev_err(dev,
-				"Failed to init MTT srqwqe memory, aborting.\n");
+				"Failed to init SCC context memory, aborting.\n");
 			goto err_unmap_srq;
 		}
 	}
 
-	if (hr_dev->caps.num_idx_segs) {
-		ret = hns_roce_init_hem_table(hr_dev,
-					      &hr_dev->mr_table.mtt_idx_table,
-					      HEM_TYPE_IDX,
-					      hr_dev->caps.idx_entry_sz,
-					      hr_dev->caps.num_idx_segs, 1);
-		if (ret) {
-			dev_err(dev,
-				"Failed to init MTT idx memory, aborting.\n");
-			goto err_unmap_srqwqe;
-		}
-	}
-
-	if (hr_dev->caps.sccc_entry_sz) {
-		ret = hns_roce_init_hem_table(hr_dev,
-					      &hr_dev->qp_table.sccc_table,
-					      HEM_TYPE_SCCC,
-					      hr_dev->caps.sccc_entry_sz,
-					      hr_dev->caps.num_qps, 1);
-		if (ret) {
-			dev_err(dev,
-			      "Failed to init SCC context memory, aborting.\n");
-			goto err_unmap_idx;
-		}
-	}
-
 	if (hr_dev->caps.qpc_timer_entry_sz) {
-		ret = hns_roce_init_hem_table(hr_dev,
-					      &hr_dev->qpc_timer_table,
+		ret = hns_roce_init_hem_table(hr_dev, &hr_dev->qpc_timer_table,
 					      HEM_TYPE_QPC_TIMER,
 					      hr_dev->caps.qpc_timer_entry_sz,
 					      hr_dev->caps.num_qpc_timer, 1);
 		if (ret) {
 			dev_err(dev,
-			      "Failed to init QPC timer memory, aborting.\n");
+				"Failed to init QPC timer memory, aborting.\n");
 			goto err_unmap_ctx;
 		}
 	}
 
 	if (hr_dev->caps.cqc_timer_entry_sz) {
-		ret = hns_roce_init_hem_table(hr_dev,
-					      &hr_dev->cqc_timer_table,
+		ret = hns_roce_init_hem_table(hr_dev, &hr_dev->cqc_timer_table,
 					      HEM_TYPE_CQC_TIMER,
 					      hr_dev->caps.cqc_timer_entry_sz,
 					      hr_dev->caps.num_cqc_timer, 1);
 		if (ret) {
 			dev_err(dev,
-			      "Failed to init CQC timer memory, aborting.\n");
+				"Failed to init CQC timer memory, aborting.\n");
 			goto err_unmap_qpc_timer;
 		}
 	}
@@ -732,26 +688,14 @@
 
 err_unmap_qpc_timer:
 	if (hr_dev->caps.qpc_timer_entry_sz)
-		hns_roce_cleanup_hem_table(hr_dev,
-					   &hr_dev->qpc_timer_table);
+		hns_roce_cleanup_hem_table(hr_dev, &hr_dev->qpc_timer_table);
 
 err_unmap_ctx:
-	if (hr_dev->caps.sccc_entry_sz)
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL)
 		hns_roce_cleanup_hem_table(hr_dev,
 					   &hr_dev->qp_table.sccc_table);
-
-err_unmap_idx:
-	if (hr_dev->caps.num_idx_segs)
-		hns_roce_cleanup_hem_table(hr_dev,
-					   &hr_dev->mr_table.mtt_idx_table);
-
-err_unmap_srqwqe:
-	if (hr_dev->caps.num_srqwqe_segs)
-		hns_roce_cleanup_hem_table(hr_dev,
-					   &hr_dev->mr_table.mtt_srqwqe_table);
-
 err_unmap_srq:
-	if (hr_dev->caps.srqc_entry_sz)
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SRQ)
 		hns_roce_cleanup_hem_table(hr_dev, &hr_dev->srq_table.table);
 
 err_unmap_cq:
@@ -771,14 +715,6 @@
 err_unmap_dmpt:
 	hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtpt_table);
 
-err_unmap_mtt:
-	if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE))
-		hns_roce_cleanup_hem_table(hr_dev,
-					   &hr_dev->mr_table.mtt_cqe_table);
-
-err_unmap_cqe:
-	hns_roce_cleanup_hem_table(hr_dev, &hr_dev->mr_table.mtt_table);
-
 	return ret;
 }
 
@@ -867,6 +803,50 @@
 	return ret;
 }
 
+static void check_and_get_armed_cq(struct list_head *cq_list, struct ib_cq *cq)
+{
+	struct hns_roce_cq *hr_cq = to_hr_cq(cq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&hr_cq->lock, flags);
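+	/* only add a CQ to cq_list once; is_armed marks it as already queued */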
+	if (cq->comp_handler) {
+		if (!hr_cq->is_armed) {
+			hr_cq->is_armed = 1;
+			list_add_tail(&hr_cq->node, cq_list);
+		}
+	}
+	spin_unlock_irqrestore(&hr_cq->lock, flags);
+}
+
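+/*
+ * When a fatal device error occurs, generate a completion event on every CQ
+ * attached to a QP that still has unfinished send or receive work, so the
+ * CQ owners get notified.
+ */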
+void hns_roce_handle_device_err(struct hns_roce_dev *hr_dev)
+{
+	struct hns_roce_qp *hr_qp;
+	struct hns_roce_cq *hr_cq;
+	struct list_head cq_list;
+	unsigned long flags_qp;
+	unsigned long flags;
+
+	INIT_LIST_HEAD(&cq_list);
+
+	spin_lock_irqsave(&hr_dev->qp_list_lock, flags);
+	list_for_each_entry(hr_qp, &hr_dev->qp_list, node) {
+		spin_lock_irqsave(&hr_qp->sq.lock, flags_qp);
+		if (hr_qp->sq.tail != hr_qp->sq.head)
+			check_and_get_armed_cq(&cq_list, hr_qp->ibqp.send_cq);
+		spin_unlock_irqrestore(&hr_qp->sq.lock, flags_qp);
+
+		spin_lock_irqsave(&hr_qp->rq.lock, flags_qp);
+		if ((!hr_qp->ibqp.srq) && (hr_qp->rq.tail != hr_qp->rq.head))
+			check_and_get_armed_cq(&cq_list, hr_qp->ibqp.recv_cq);
+		spin_unlock_irqrestore(&hr_qp->rq.lock, flags_qp);
+	}
+
+	list_for_each_entry(hr_cq, &cq_list, node)
+		hns_roce_cq_completion(hr_dev, hr_cq->cqn);
+
+	spin_unlock_irqrestore(&hr_dev->qp_list_lock, flags);
+}
+
 int hns_roce_init(struct hns_roce_dev *hr_dev)
 {
 	int ret;
@@ -937,6 +917,9 @@
 		}
 	}
 
+	INIT_LIST_HEAD(&hr_dev->qp_list);
+	spin_lock_init(&hr_dev->qp_list_lock);
+
 	ret = hns_roce_register_device(hr_dev);
 	if (ret)
 		goto error_failed_register_device;
diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c
index 702b59f..027ec84 100644
--- a/drivers/infiniband/hw/hns/hns_roce_mr.c
+++ b/drivers/infiniband/hw/hns/hns_roce_mr.c
@@ -48,689 +48,120 @@
 	return (key << 24) | (key >> 8);
 }
 
-static int hns_roce_sw2hw_mpt(struct hns_roce_dev *hr_dev,
-			      struct hns_roce_cmd_mailbox *mailbox,
-			      unsigned long mpt_index)
+static int hns_roce_hw_create_mpt(struct hns_roce_dev *hr_dev,
+				  struct hns_roce_cmd_mailbox *mailbox,
+				  unsigned long mpt_index)
 {
 	return hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, mpt_index, 0,
-				 HNS_ROCE_CMD_SW2HW_MPT,
+				 HNS_ROCE_CMD_CREATE_MPT,
 				 HNS_ROCE_CMD_TIMEOUT_MSECS);
 }
 
-int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev,
-			      struct hns_roce_cmd_mailbox *mailbox,
-			      unsigned long mpt_index)
+int hns_roce_hw_destroy_mpt(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_cmd_mailbox *mailbox,
+			    unsigned long mpt_index)
 {
 	return hns_roce_cmd_mbox(hr_dev, 0, mailbox ? mailbox->dma : 0,
-				 mpt_index, !mailbox, HNS_ROCE_CMD_HW2SW_MPT,
+				 mpt_index, !mailbox, HNS_ROCE_CMD_DESTROY_MPT,
 				 HNS_ROCE_CMD_TIMEOUT_MSECS);
 }
 
-static int hns_roce_buddy_alloc(struct hns_roce_buddy *buddy, int order,
-				unsigned long *seg)
+static int alloc_mr_key(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
+			u32 pd, u64 iova, u64 size, u32 access)
 {
-	int o;
-	u32 m;
-
-	spin_lock(&buddy->lock);
-
-	for (o = order; o <= buddy->max_order; ++o) {
-		if (buddy->num_free[o]) {
-			m = 1 << (buddy->max_order - o);
-			*seg = find_first_bit(buddy->bits[o], m);
-			if (*seg < m)
-				goto found;
-		}
-	}
-	spin_unlock(&buddy->lock);
-	return -1;
-
- found:
-	clear_bit(*seg, buddy->bits[o]);
-	--buddy->num_free[o];
-
-	while (o > order) {
-		--o;
-		*seg <<= 1;
-		set_bit(*seg ^ 1, buddy->bits[o]);
-		++buddy->num_free[o];
-	}
-
-	spin_unlock(&buddy->lock);
-
-	*seg <<= order;
-	return 0;
-}
-
-static void hns_roce_buddy_free(struct hns_roce_buddy *buddy, unsigned long seg,
-				int order)
-{
-	seg >>= order;
-
-	spin_lock(&buddy->lock);
-
-	while (test_bit(seg ^ 1, buddy->bits[order])) {
-		clear_bit(seg ^ 1, buddy->bits[order]);
-		--buddy->num_free[order];
-		seg >>= 1;
-		++order;
-	}
-
-	set_bit(seg, buddy->bits[order]);
-	++buddy->num_free[order];
-
-	spin_unlock(&buddy->lock);
-}
-
-static int hns_roce_buddy_init(struct hns_roce_buddy *buddy, int max_order)
-{
-	int i, s;
-
-	buddy->max_order = max_order;
-	spin_lock_init(&buddy->lock);
-	buddy->bits = kcalloc(buddy->max_order + 1,
-			      sizeof(*buddy->bits),
-			      GFP_KERNEL);
-	buddy->num_free = kcalloc(buddy->max_order + 1,
-				  sizeof(*buddy->num_free),
-				  GFP_KERNEL);
-	if (!buddy->bits || !buddy->num_free)
-		goto err_out;
-
-	for (i = 0; i <= buddy->max_order; ++i) {
-		s = BITS_TO_LONGS(1 << (buddy->max_order - i));
-		buddy->bits[i] = kcalloc(s, sizeof(long), GFP_KERNEL |
-					 __GFP_NOWARN);
-		if (!buddy->bits[i]) {
-			buddy->bits[i] = vzalloc(array_size(s, sizeof(long)));
-			if (!buddy->bits[i])
-				goto err_out_free;
-		}
-	}
-
-	set_bit(0, buddy->bits[buddy->max_order]);
-	buddy->num_free[buddy->max_order] = 1;
-
-	return 0;
-
-err_out_free:
-	for (i = 0; i <= buddy->max_order; ++i)
-		kvfree(buddy->bits[i]);
-
-err_out:
-	kfree(buddy->bits);
-	kfree(buddy->num_free);
-	return -ENOMEM;
-}
-
-static void hns_roce_buddy_cleanup(struct hns_roce_buddy *buddy)
-{
-	int i;
-
-	for (i = 0; i <= buddy->max_order; ++i)
-		kvfree(buddy->bits[i]);
-
-	kfree(buddy->bits);
-	kfree(buddy->num_free);
-}
-
-static int hns_roce_alloc_mtt_range(struct hns_roce_dev *hr_dev, int order,
-				    unsigned long *seg, u32 mtt_type)
-{
-	struct hns_roce_mr_table *mr_table = &hr_dev->mr_table;
-	struct hns_roce_hem_table *table;
-	struct hns_roce_buddy *buddy;
-	int ret;
-
-	switch (mtt_type) {
-	case MTT_TYPE_WQE:
-		buddy = &mr_table->mtt_buddy;
-		table = &mr_table->mtt_table;
-		break;
-	case MTT_TYPE_CQE:
-		buddy = &mr_table->mtt_cqe_buddy;
-		table = &mr_table->mtt_cqe_table;
-		break;
-	case MTT_TYPE_SRQWQE:
-		buddy = &mr_table->mtt_srqwqe_buddy;
-		table = &mr_table->mtt_srqwqe_table;
-		break;
-	case MTT_TYPE_IDX:
-		buddy = &mr_table->mtt_idx_buddy;
-		table = &mr_table->mtt_idx_table;
-		break;
-	default:
-		dev_err(hr_dev->dev, "Unsupport MTT table type: %d\n",
-			mtt_type);
-		return -EINVAL;
-	}
-
-	ret = hns_roce_buddy_alloc(buddy, order, seg);
-	if (ret == -1)
-		return -1;
-
-	if (hns_roce_table_get_range(hr_dev, table, *seg,
-				     *seg + (1 << order) - 1)) {
-		hns_roce_buddy_free(buddy, *seg, order);
-		return -1;
-	}
-
-	return 0;
-}
-
-int hns_roce_mtt_init(struct hns_roce_dev *hr_dev, int npages, int page_shift,
-		      struct hns_roce_mtt *mtt)
-{
-	int ret;
-	int i;
-
-	/* Page num is zero, correspond to DMA memory register */
-	if (!npages) {
-		mtt->order = -1;
-		mtt->page_shift = HNS_ROCE_HEM_PAGE_SHIFT;
-		return 0;
-	}
-
-	/* Note: if page_shift is zero, FAST memory register */
-	mtt->page_shift = page_shift;
-
-	/* Compute MTT entry necessary */
-	for (mtt->order = 0, i = HNS_ROCE_MTT_ENTRY_PER_SEG; i < npages;
-	     i <<= 1)
-		++mtt->order;
-
-	/* Allocate MTT entry */
-	ret = hns_roce_alloc_mtt_range(hr_dev, mtt->order, &mtt->first_seg,
-				       mtt->mtt_type);
-	if (ret == -1)
-		return -ENOMEM;
-
-	return 0;
-}
-
-void hns_roce_mtt_cleanup(struct hns_roce_dev *hr_dev, struct hns_roce_mtt *mtt)
-{
-	struct hns_roce_mr_table *mr_table = &hr_dev->mr_table;
-
-	if (mtt->order < 0)
-		return;
-
-	switch (mtt->mtt_type) {
-	case MTT_TYPE_WQE:
-		hns_roce_buddy_free(&mr_table->mtt_buddy, mtt->first_seg,
-				    mtt->order);
-		hns_roce_table_put_range(hr_dev, &mr_table->mtt_table,
-					mtt->first_seg,
-					mtt->first_seg + (1 << mtt->order) - 1);
-		break;
-	case MTT_TYPE_CQE:
-		hns_roce_buddy_free(&mr_table->mtt_cqe_buddy, mtt->first_seg,
-				    mtt->order);
-		hns_roce_table_put_range(hr_dev, &mr_table->mtt_cqe_table,
-					mtt->first_seg,
-					mtt->first_seg + (1 << mtt->order) - 1);
-		break;
-	case MTT_TYPE_SRQWQE:
-		hns_roce_buddy_free(&mr_table->mtt_srqwqe_buddy, mtt->first_seg,
-				    mtt->order);
-		hns_roce_table_put_range(hr_dev, &mr_table->mtt_srqwqe_table,
-					mtt->first_seg,
-					mtt->first_seg + (1 << mtt->order) - 1);
-		break;
-	case MTT_TYPE_IDX:
-		hns_roce_buddy_free(&mr_table->mtt_idx_buddy, mtt->first_seg,
-				    mtt->order);
-		hns_roce_table_put_range(hr_dev, &mr_table->mtt_idx_table,
-					mtt->first_seg,
-					mtt->first_seg + (1 << mtt->order) - 1);
-		break;
-	default:
-		dev_err(hr_dev->dev,
-			"Unsupport mtt type %d, clean mtt failed\n",
-			mtt->mtt_type);
-		break;
-	}
-}
-
-static void hns_roce_loop_free(struct hns_roce_dev *hr_dev,
-			       struct hns_roce_mr *mr, int err_loop_index,
-			       int loop_i, int loop_j)
-{
-	struct device *dev = hr_dev->dev;
-	u32 mhop_num;
-	u32 pbl_bt_sz;
-	u64 bt_idx;
-	int i, j;
-
-	pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT);
-	mhop_num = hr_dev->caps.pbl_hop_num;
-
-	i = loop_i;
-	if (mhop_num == 3 && err_loop_index == 2) {
-		for (; i >= 0; i--) {
-			dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i],
-					  mr->pbl_l1_dma_addr[i]);
-
-			for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) {
-				if (i == loop_i && j >= loop_j)
-					break;
-
-				bt_idx = i * pbl_bt_sz / BA_BYTE_LEN + j;
-				dma_free_coherent(dev, pbl_bt_sz,
-						  mr->pbl_bt_l2[bt_idx],
-						  mr->pbl_l2_dma_addr[bt_idx]);
-			}
-		}
-	} else if (mhop_num == 3 && err_loop_index == 1) {
-		for (i -= 1; i >= 0; i--) {
-			dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i],
-					  mr->pbl_l1_dma_addr[i]);
-
-			for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) {
-				bt_idx = i * pbl_bt_sz / BA_BYTE_LEN + j;
-				dma_free_coherent(dev, pbl_bt_sz,
-						  mr->pbl_bt_l2[bt_idx],
-						  mr->pbl_l2_dma_addr[bt_idx]);
-			}
-		}
-	} else if (mhop_num == 2 && err_loop_index == 1) {
-		for (i -= 1; i >= 0; i--)
-			dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i],
-					  mr->pbl_l1_dma_addr[i]);
-	} else {
-		dev_warn(dev, "not support: mhop_num=%d, err_loop_index=%d.",
-			 mhop_num, err_loop_index);
-		return;
-	}
-
-	dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l0, mr->pbl_l0_dma_addr);
-	mr->pbl_bt_l0 = NULL;
-	mr->pbl_l0_dma_addr = 0;
-}
-static int pbl_1hop_alloc(struct hns_roce_dev *hr_dev, int npages,
-			       struct hns_roce_mr *mr, u32 pbl_bt_sz)
-{
-	struct device *dev = hr_dev->dev;
-
-	if (npages > pbl_bt_sz / 8) {
-		dev_err(dev, "npages %d is larger than buf_pg_sz!",
-			npages);
-		return -EINVAL;
-	}
-	mr->pbl_buf = dma_alloc_coherent(dev, npages * 8,
-					 &(mr->pbl_dma_addr),
-					 GFP_KERNEL);
-	if (!mr->pbl_buf)
-		return -ENOMEM;
-
-	mr->pbl_size = npages;
-	mr->pbl_ba = mr->pbl_dma_addr;
-	mr->pbl_hop_num = 1;
-	mr->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz;
-	mr->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz;
-	return 0;
-
-}
-
-
-static int pbl_2hop_alloc(struct hns_roce_dev *hr_dev, int npages,
-			       struct hns_roce_mr *mr, u32 pbl_bt_sz)
-{
-	struct device *dev = hr_dev->dev;
-	int npages_allocated;
-	u64 pbl_last_bt_num;
-	u64 pbl_bt_cnt = 0;
-	u64 size;
-	int i;
-
-	pbl_last_bt_num = (npages + pbl_bt_sz / 8 - 1) / (pbl_bt_sz / 8);
-
-	/* alloc L1 BT */
-	for (i = 0; i < pbl_bt_sz / 8; i++) {
-		if (pbl_bt_cnt + 1 < pbl_last_bt_num) {
-			size = pbl_bt_sz;
-		} else {
-			npages_allocated = i * (pbl_bt_sz / 8);
-			size = (npages - npages_allocated) * 8;
-		}
-		mr->pbl_bt_l1[i] = dma_alloc_coherent(dev, size,
-					    &(mr->pbl_l1_dma_addr[i]),
-					    GFP_KERNEL);
-		if (!mr->pbl_bt_l1[i]) {
-			hns_roce_loop_free(hr_dev, mr, 1, i, 0);
-			return -ENOMEM;
-		}
-
-		*(mr->pbl_bt_l0 + i) = mr->pbl_l1_dma_addr[i];
-
-		pbl_bt_cnt++;
-		if (pbl_bt_cnt >= pbl_last_bt_num)
-			break;
-	}
-
-	mr->l0_chunk_last_num = i + 1;
-
-	return 0;
-}
-
-static int pbl_3hop_alloc(struct hns_roce_dev *hr_dev, int npages,
-			       struct hns_roce_mr *mr, u32 pbl_bt_sz)
-{
-	struct device *dev = hr_dev->dev;
-	int mr_alloc_done = 0;
-	int npages_allocated;
-	u64 pbl_last_bt_num;
-	u64 pbl_bt_cnt = 0;
-	u64 bt_idx;
-	u64 size;
-	int i;
-	int j = 0;
-
-	pbl_last_bt_num = (npages + pbl_bt_sz / 8 - 1) / (pbl_bt_sz / 8);
-
-	mr->pbl_l2_dma_addr = kcalloc(pbl_last_bt_num,
-				      sizeof(*mr->pbl_l2_dma_addr),
-				      GFP_KERNEL);
-	if (!mr->pbl_l2_dma_addr)
-		return -ENOMEM;
-
-	mr->pbl_bt_l2 = kcalloc(pbl_last_bt_num,
-				sizeof(*mr->pbl_bt_l2),
-				GFP_KERNEL);
-	if (!mr->pbl_bt_l2)
-		goto err_kcalloc_bt_l2;
-
-	/* alloc L1, L2 BT */
-	for (i = 0; i < pbl_bt_sz / 8; i++) {
-		mr->pbl_bt_l1[i] = dma_alloc_coherent(dev, pbl_bt_sz,
-					    &(mr->pbl_l1_dma_addr[i]),
-					    GFP_KERNEL);
-		if (!mr->pbl_bt_l1[i]) {
-			hns_roce_loop_free(hr_dev, mr, 1, i, 0);
-			goto err_dma_alloc_l0;
-		}
-
-		*(mr->pbl_bt_l0 + i) = mr->pbl_l1_dma_addr[i];
-
-		for (j = 0; j < pbl_bt_sz / 8; j++) {
-			bt_idx = i * pbl_bt_sz / 8 + j;
-
-			if (pbl_bt_cnt + 1 < pbl_last_bt_num) {
-				size = pbl_bt_sz;
-			} else {
-				npages_allocated = bt_idx *
-						   (pbl_bt_sz / 8);
-				size = (npages - npages_allocated) * 8;
-			}
-			mr->pbl_bt_l2[bt_idx] = dma_alloc_coherent(
-				      dev, size,
-				      &(mr->pbl_l2_dma_addr[bt_idx]),
-				      GFP_KERNEL);
-			if (!mr->pbl_bt_l2[bt_idx]) {
-				hns_roce_loop_free(hr_dev, mr, 2, i, j);
-				goto err_dma_alloc_l0;
-			}
-
-			*(mr->pbl_bt_l1[i] + j) =
-					mr->pbl_l2_dma_addr[bt_idx];
-
-			pbl_bt_cnt++;
-			if (pbl_bt_cnt >= pbl_last_bt_num) {
-				mr_alloc_done = 1;
-				break;
-			}
-		}
-
-		if (mr_alloc_done)
-			break;
-	}
-
-	mr->l0_chunk_last_num = i + 1;
-	mr->l1_chunk_last_num = j + 1;
-
-
-	return 0;
-
-err_dma_alloc_l0:
-	kfree(mr->pbl_bt_l2);
-	mr->pbl_bt_l2 = NULL;
-
-err_kcalloc_bt_l2:
-	kfree(mr->pbl_l2_dma_addr);
-	mr->pbl_l2_dma_addr = NULL;
-
-	return -ENOMEM;
-}
-
-
-/* PBL multi hop addressing */
-static int hns_roce_mhop_alloc(struct hns_roce_dev *hr_dev, int npages,
-			       struct hns_roce_mr *mr)
-{
-	struct device *dev = hr_dev->dev;
-	u32 pbl_bt_sz;
-	u32 mhop_num;
-
-	mhop_num = (mr->type == MR_TYPE_FRMR ? 1 : hr_dev->caps.pbl_hop_num);
-	pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT);
-
-	if (mhop_num == HNS_ROCE_HOP_NUM_0)
-		return 0;
-
-	if (mhop_num == 1)
-		return pbl_1hop_alloc(hr_dev, npages, mr, pbl_bt_sz);
-
-	mr->pbl_l1_dma_addr = kcalloc(pbl_bt_sz / 8,
-				      sizeof(*mr->pbl_l1_dma_addr),
-				      GFP_KERNEL);
-	if (!mr->pbl_l1_dma_addr)
-		return -ENOMEM;
-
-	mr->pbl_bt_l1 = kcalloc(pbl_bt_sz / 8, sizeof(*mr->pbl_bt_l1),
-				GFP_KERNEL);
-	if (!mr->pbl_bt_l1)
-		goto err_kcalloc_bt_l1;
-
-	/* alloc L0 BT */
-	mr->pbl_bt_l0 = dma_alloc_coherent(dev, pbl_bt_sz,
-					   &(mr->pbl_l0_dma_addr),
-					   GFP_KERNEL);
-	if (!mr->pbl_bt_l0)
-		goto err_kcalloc_l2_dma;
-
-	if (mhop_num == 2) {
-		if (pbl_2hop_alloc(hr_dev, npages, mr, pbl_bt_sz))
-			goto err_kcalloc_l2_dma;
-	}
-
-	if (mhop_num == 3) {
-		if (pbl_3hop_alloc(hr_dev, npages, mr, pbl_bt_sz))
-			goto err_kcalloc_l2_dma;
-	}
-
-
-	mr->pbl_size = npages;
-	mr->pbl_ba = mr->pbl_l0_dma_addr;
-	mr->pbl_hop_num = hr_dev->caps.pbl_hop_num;
-	mr->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz;
-	mr->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz;
-
-	return 0;
-
-err_kcalloc_l2_dma:
-	kfree(mr->pbl_bt_l1);
-	mr->pbl_bt_l1 = NULL;
-
-err_kcalloc_bt_l1:
-	kfree(mr->pbl_l1_dma_addr);
-	mr->pbl_l1_dma_addr = NULL;
-
-	return -ENOMEM;
-}
-
-static int hns_roce_mr_alloc(struct hns_roce_dev *hr_dev, u32 pd, u64 iova,
-			     u64 size, u32 access, int npages,
-			     struct hns_roce_mr *mr)
-{
-	struct device *dev = hr_dev->dev;
-	unsigned long index = 0;
-	int ret;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	unsigned long obj = 0;
+	int err;
 
 	/* Allocate a key for mr from mr_table */
-	ret = hns_roce_bitmap_alloc(&hr_dev->mr_table.mtpt_bitmap, &index);
-	if (ret == -1)
+	err = hns_roce_bitmap_alloc(&hr_dev->mr_table.mtpt_bitmap, &obj);
+	if (err) {
+		ibdev_err(ibdev,
+			  "failed to alloc bitmap for MR key, ret = %d.\n",
+			  err);
 		return -ENOMEM;
+	}
 
 	mr->iova = iova;			/* MR va starting addr */
 	mr->size = size;			/* MR addr range */
 	mr->pd = pd;				/* MR num */
 	mr->access = access;			/* MR access permit */
 	mr->enabled = 0;			/* MR active status */
-	mr->key = hw_index_to_key(index);	/* MR key */
+	mr->key = hw_index_to_key(obj);		/* MR key */
 
-	if (size == ~0ull) {
-		mr->pbl_buf = NULL;
-		mr->pbl_dma_addr = 0;
-		/* PBL multi-hop addressing parameters */
-		mr->pbl_bt_l2 = NULL;
-		mr->pbl_bt_l1 = NULL;
-		mr->pbl_bt_l0 = NULL;
-		mr->pbl_l2_dma_addr = NULL;
-		mr->pbl_l1_dma_addr = NULL;
-		mr->pbl_l0_dma_addr = 0;
-	} else {
-		if (!hr_dev->caps.pbl_hop_num) {
-			mr->pbl_buf = dma_alloc_coherent(dev,
-							 npages * BA_BYTE_LEN,
-							 &(mr->pbl_dma_addr),
-							 GFP_KERNEL);
-			if (!mr->pbl_buf)
-				return -ENOMEM;
-		} else {
-			ret = hns_roce_mhop_alloc(hr_dev, npages, mr);
-		}
+	err = hns_roce_table_get(hr_dev, &hr_dev->mr_table.mtpt_table, obj);
+	if (err) {
+		ibdev_err(ibdev, "failed to alloc mtpt, ret = %d.\n", err);
+		goto err_free_bitmap;
 	}
 
-	return ret;
+	return 0;
+err_free_bitmap:
+	hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap, obj, BITMAP_NO_RR);
+	return err;
 }
 
-static void hns_roce_mhop_free(struct hns_roce_dev *hr_dev,
-			       struct hns_roce_mr *mr)
+static void free_mr_key(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr)
 {
-	struct device *dev = hr_dev->dev;
-	int npages_allocated;
-	int npages;
-	int i, j;
-	u32 pbl_bt_sz;
-	u32 mhop_num;
-	u64 bt_idx;
+	unsigned long obj = key_to_hw_index(mr->key);
 
-	npages = mr->pbl_size;
-	pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT);
-	mhop_num = (mr->type == MR_TYPE_FRMR) ? 1 : hr_dev->caps.pbl_hop_num;
+	hns_roce_table_put(hr_dev, &hr_dev->mr_table.mtpt_table, obj);
+	hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap, obj, BITMAP_NO_RR);
+}
 
-	if (mhop_num == HNS_ROCE_HOP_NUM_0)
-		return;
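+/*
+ * Create the PBL MTR that holds the page addresses of the MR's buffer. User
+ * MRs pin the region starting at 'start'; fast MRs (mtt_only) only reserve
+ * the MTT here and get their pages attached later in hns_roce_map_mr_sg().
+ */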
+static int alloc_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
+			size_t length, struct ib_udata *udata, u64 start,
+			int access)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	bool is_fast = mr->type == MR_TYPE_FRMR;
+	struct hns_roce_buf_attr buf_attr = {};
+	int err;
 
-	if (mhop_num == 1) {
-		dma_free_coherent(dev, (unsigned int)(npages * BA_BYTE_LEN),
-				  mr->pbl_buf, mr->pbl_dma_addr);
-		return;
-	}
+	mr->pbl_hop_num = is_fast ? 1 : hr_dev->caps.pbl_hop_num;
+	buf_attr.page_shift = is_fast ? PAGE_SHIFT :
+			      hr_dev->caps.pbl_buf_pg_sz + PAGE_SHIFT;
+	buf_attr.region[0].size = length;
+	buf_attr.region[0].hopnum = mr->pbl_hop_num;
+	buf_attr.region_count = 1;
+	buf_attr.fixed_page = true;
+	buf_attr.user_access = access;
+	/* a fast MR's buffer is allocated before mapping, not at creation */
+	buf_attr.mtt_only = is_fast;
 
-	dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l0,
-			  mr->pbl_l0_dma_addr);
+	err = hns_roce_mtr_create(hr_dev, &mr->pbl_mtr, &buf_attr,
+				  hr_dev->caps.pbl_ba_pg_sz + HNS_HW_PAGE_SHIFT,
+				  udata, start);
+	if (err)
+		ibdev_err(ibdev, "failed to alloc pbl mtr, ret = %d.\n", err);
+	else
+		mr->npages = mr->pbl_mtr.hem_cfg.buf_pg_count;
 
-	if (mhop_num == 2) {
-		for (i = 0; i < mr->l0_chunk_last_num; i++) {
-			if (i == mr->l0_chunk_last_num - 1) {
-				npages_allocated =
-						i * (pbl_bt_sz / BA_BYTE_LEN);
+	return err;
+}
 
-				dma_free_coherent(dev,
-				      (npages - npages_allocated) * BA_BYTE_LEN,
-				       mr->pbl_bt_l1[i],
-				       mr->pbl_l1_dma_addr[i]);
-
-				break;
-			}
-
-			dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i],
-					  mr->pbl_l1_dma_addr[i]);
-		}
-	} else if (mhop_num == 3) {
-		for (i = 0; i < mr->l0_chunk_last_num; i++) {
-			dma_free_coherent(dev, pbl_bt_sz, mr->pbl_bt_l1[i],
-					  mr->pbl_l1_dma_addr[i]);
-
-			for (j = 0; j < pbl_bt_sz / BA_BYTE_LEN; j++) {
-				bt_idx = i * (pbl_bt_sz / BA_BYTE_LEN) + j;
-
-				if ((i == mr->l0_chunk_last_num - 1)
-				    && j == mr->l1_chunk_last_num - 1) {
-					npages_allocated = bt_idx *
-						      (pbl_bt_sz / BA_BYTE_LEN);
-
-					dma_free_coherent(dev,
-					      (npages - npages_allocated) *
-					      BA_BYTE_LEN,
-					      mr->pbl_bt_l2[bt_idx],
-					      mr->pbl_l2_dma_addr[bt_idx]);
-
-					break;
-				}
-
-				dma_free_coherent(dev, pbl_bt_sz,
-						mr->pbl_bt_l2[bt_idx],
-						mr->pbl_l2_dma_addr[bt_idx]);
-			}
-		}
-	}
-
-	kfree(mr->pbl_bt_l1);
-	kfree(mr->pbl_l1_dma_addr);
-	mr->pbl_bt_l1 = NULL;
-	mr->pbl_l1_dma_addr = NULL;
-	if (mhop_num == 3) {
-		kfree(mr->pbl_bt_l2);
-		kfree(mr->pbl_l2_dma_addr);
-		mr->pbl_bt_l2 = NULL;
-		mr->pbl_l2_dma_addr = NULL;
-	}
+static void free_mr_pbl(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr)
+{
+	hns_roce_mtr_destroy(hr_dev, &mr->pbl_mtr);
 }
 
 static void hns_roce_mr_free(struct hns_roce_dev *hr_dev,
 			     struct hns_roce_mr *mr)
 {
-	struct device *dev = hr_dev->dev;
-	int npages = 0;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	int ret;
 
 	if (mr->enabled) {
-		ret = hns_roce_hw2sw_mpt(hr_dev, NULL, key_to_hw_index(mr->key)
-					 & (hr_dev->caps.num_mtpts - 1));
+		ret = hns_roce_hw_destroy_mpt(hr_dev, NULL,
+					      key_to_hw_index(mr->key) &
+					      (hr_dev->caps.num_mtpts - 1));
 		if (ret)
-			dev_warn(dev, "HW2SW_MPT failed (%d)\n", ret);
+			ibdev_warn(ibdev, "failed to destroy mpt, ret = %d.\n",
+				   ret);
 	}
 
-	if (mr->size != ~0ULL) {
-		if (mr->type == MR_TYPE_MR)
-			npages = ib_umem_page_count(mr->umem);
-
-		if (!hr_dev->caps.pbl_hop_num)
-			dma_free_coherent(dev,
-					  (unsigned int)(npages * BA_BYTE_LEN),
-					  mr->pbl_buf, mr->pbl_dma_addr);
-		else
-			hns_roce_mhop_free(hr_dev, mr);
-	}
-
-	if (mr->enabled)
-		hns_roce_table_put(hr_dev, &hr_dev->mr_table.mtpt_table,
-				   key_to_hw_index(mr->key));
-
-	hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap,
-			     key_to_hw_index(mr->key), BITMAP_NO_RR);
+	free_mr_pbl(hr_dev, mr);
+	free_mr_key(hr_dev, mr);
 }
 
 static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev,
@@ -740,33 +171,28 @@
 	unsigned long mtpt_idx = key_to_hw_index(mr->key);
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_cmd_mailbox *mailbox;
-	struct hns_roce_mr_table *mr_table = &hr_dev->mr_table;
-
-	/* Prepare HEM entry memory */
-	ret = hns_roce_table_get(hr_dev, &mr_table->mtpt_table, mtpt_idx);
-	if (ret)
-		return ret;
 
 	/* Allocate mailbox memory */
 	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
 	if (IS_ERR(mailbox)) {
 		ret = PTR_ERR(mailbox);
-		goto err_table;
+		return ret;
 	}
 
 	if (mr->type != MR_TYPE_FRMR)
-		ret = hr_dev->hw->write_mtpt(mailbox->buf, mr, mtpt_idx);
+		ret = hr_dev->hw->write_mtpt(hr_dev, mailbox->buf, mr,
+					     mtpt_idx);
 	else
-		ret = hr_dev->hw->frmr_write_mtpt(mailbox->buf, mr);
+		ret = hr_dev->hw->frmr_write_mtpt(hr_dev, mailbox->buf, mr);
 	if (ret) {
-		dev_err(dev, "Write mtpt fail!\n");
+		dev_err(dev, "failed to write mtpt, ret = %d.\n", ret);
 		goto err_page;
 	}
 
-	ret = hns_roce_sw2hw_mpt(hr_dev, mailbox,
-				 mtpt_idx & (hr_dev->caps.num_mtpts - 1));
+	ret = hns_roce_hw_create_mpt(hr_dev, mailbox,
+				     mtpt_idx & (hr_dev->caps.num_mtpts - 1));
 	if (ret) {
-		dev_err(dev, "SW2HW_MPT failed (%d)\n", ret);
+		dev_err(dev, "failed to create mpt, ret = %d.\n", ret);
 		goto err_page;
 	}
 
@@ -778,137 +204,6 @@
 err_page:
 	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
 
-err_table:
-	hns_roce_table_put(hr_dev, &mr_table->mtpt_table, mtpt_idx);
-	return ret;
-}
-
-static int hns_roce_write_mtt_chunk(struct hns_roce_dev *hr_dev,
-				    struct hns_roce_mtt *mtt, u32 start_index,
-				    u32 npages, u64 *page_list)
-{
-	struct hns_roce_hem_table *table;
-	dma_addr_t dma_handle;
-	__le64 *mtts;
-	u32 bt_page_size;
-	u32 i;
-
-	switch (mtt->mtt_type) {
-	case MTT_TYPE_WQE:
-		table = &hr_dev->mr_table.mtt_table;
-		bt_page_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + PAGE_SHIFT);
-		break;
-	case MTT_TYPE_CQE:
-		table = &hr_dev->mr_table.mtt_cqe_table;
-		bt_page_size = 1 << (hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT);
-		break;
-	case MTT_TYPE_SRQWQE:
-		table = &hr_dev->mr_table.mtt_srqwqe_table;
-		bt_page_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT);
-		break;
-	case MTT_TYPE_IDX:
-		table = &hr_dev->mr_table.mtt_idx_table;
-		bt_page_size = 1 << (hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT);
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	/* All MTTs must fit in the same page */
-	if (start_index / (bt_page_size / sizeof(u64)) !=
-		(start_index + npages - 1) / (bt_page_size / sizeof(u64)))
-		return -EINVAL;
-
-	if (start_index & (HNS_ROCE_MTT_ENTRY_PER_SEG - 1))
-		return -EINVAL;
-
-	mtts = hns_roce_table_find(hr_dev, table,
-				mtt->first_seg +
-				start_index / HNS_ROCE_MTT_ENTRY_PER_SEG,
-				&dma_handle);
-	if (!mtts)
-		return -ENOMEM;
-
-	/* Save page addr, low 12 bits : 0 */
-	for (i = 0; i < npages; ++i) {
-		if (!hr_dev->caps.mtt_hop_num)
-			mtts[i] = cpu_to_le64(page_list[i] >> PAGE_ADDR_SHIFT);
-		else
-			mtts[i] = cpu_to_le64(page_list[i]);
-	}
-
-	return 0;
-}
-
-static int hns_roce_write_mtt(struct hns_roce_dev *hr_dev,
-			      struct hns_roce_mtt *mtt, u32 start_index,
-			      u32 npages, u64 *page_list)
-{
-	int chunk;
-	int ret;
-	u32 bt_page_size;
-
-	if (mtt->order < 0)
-		return -EINVAL;
-
-	switch (mtt->mtt_type) {
-	case MTT_TYPE_WQE:
-		bt_page_size = 1 << (hr_dev->caps.mtt_ba_pg_sz + PAGE_SHIFT);
-		break;
-	case MTT_TYPE_CQE:
-		bt_page_size = 1 << (hr_dev->caps.cqe_ba_pg_sz + PAGE_SHIFT);
-		break;
-	case MTT_TYPE_SRQWQE:
-		bt_page_size = 1 << (hr_dev->caps.srqwqe_ba_pg_sz + PAGE_SHIFT);
-		break;
-	case MTT_TYPE_IDX:
-		bt_page_size = 1 << (hr_dev->caps.idx_ba_pg_sz + PAGE_SHIFT);
-		break;
-	default:
-		dev_err(hr_dev->dev,
-			"Unsupport mtt type %d, write mtt failed\n",
-			mtt->mtt_type);
-		return -EINVAL;
-	}
-
-	while (npages > 0) {
-		chunk = min_t(int, bt_page_size / sizeof(u64), npages);
-
-		ret = hns_roce_write_mtt_chunk(hr_dev, mtt, start_index, chunk,
-					       page_list);
-		if (ret)
-			return ret;
-
-		npages -= chunk;
-		start_index += chunk;
-		page_list += chunk;
-	}
-
-	return 0;
-}
-
-int hns_roce_buf_write_mtt(struct hns_roce_dev *hr_dev,
-			   struct hns_roce_mtt *mtt, struct hns_roce_buf *buf)
-{
-	u64 *page_list;
-	int ret;
-	u32 i;
-
-	page_list = kmalloc_array(buf->npages, sizeof(*page_list), GFP_KERNEL);
-	if (!page_list)
-		return -ENOMEM;
-
-	for (i = 0; i < buf->npages; ++i) {
-		if (buf->nbufs == 1)
-			page_list[i] = buf->direct.map + (i << buf->page_shift);
-		else
-			page_list[i] = buf->page_list[i].map;
-
-	}
-	ret = hns_roce_write_mtt(hr_dev, mtt, 0, buf->npages, page_list);
-
-	kfree(page_list);
-
 	return ret;
 }
 
@@ -921,50 +216,6 @@
 				   hr_dev->caps.num_mtpts,
 				   hr_dev->caps.num_mtpts - 1,
 				   hr_dev->caps.reserved_mrws, 0);
-	if (ret)
-		return ret;
-
-	ret = hns_roce_buddy_init(&mr_table->mtt_buddy,
-				  ilog2(hr_dev->caps.num_mtt_segs));
-	if (ret)
-		goto err_buddy;
-
-	if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE)) {
-		ret = hns_roce_buddy_init(&mr_table->mtt_cqe_buddy,
-					  ilog2(hr_dev->caps.num_cqe_segs));
-		if (ret)
-			goto err_buddy_cqe;
-	}
-
-	if (hr_dev->caps.num_srqwqe_segs) {
-		ret = hns_roce_buddy_init(&mr_table->mtt_srqwqe_buddy,
-					  ilog2(hr_dev->caps.num_srqwqe_segs));
-		if (ret)
-			goto err_buddy_srqwqe;
-	}
-
-	if (hr_dev->caps.num_idx_segs) {
-		ret = hns_roce_buddy_init(&mr_table->mtt_idx_buddy,
-					  ilog2(hr_dev->caps.num_idx_segs));
-		if (ret)
-			goto err_buddy_idx;
-	}
-
-	return 0;
-
-err_buddy_idx:
-	if (hr_dev->caps.num_srqwqe_segs)
-		hns_roce_buddy_cleanup(&mr_table->mtt_srqwqe_buddy);
-
-err_buddy_srqwqe:
-	if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE))
-		hns_roce_buddy_cleanup(&mr_table->mtt_cqe_buddy);
-
-err_buddy_cqe:
-	hns_roce_buddy_cleanup(&mr_table->mtt_buddy);
-
-err_buddy:
-	hns_roce_bitmap_cleanup(&mr_table->mtpt_bitmap);
 	return ret;
 }
 
@@ -972,30 +223,24 @@
 {
 	struct hns_roce_mr_table *mr_table = &hr_dev->mr_table;
 
-	if (hr_dev->caps.num_idx_segs)
-		hns_roce_buddy_cleanup(&mr_table->mtt_idx_buddy);
-	if (hr_dev->caps.num_srqwqe_segs)
-		hns_roce_buddy_cleanup(&mr_table->mtt_srqwqe_buddy);
-	hns_roce_buddy_cleanup(&mr_table->mtt_buddy);
-	if (hns_roce_check_whether_mhop(hr_dev, HEM_TYPE_CQE))
-		hns_roce_buddy_cleanup(&mr_table->mtt_cqe_buddy);
 	hns_roce_bitmap_cleanup(&mr_table->mtpt_bitmap);
 }
 
 struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc)
 {
+	struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
 	struct hns_roce_mr *mr;
 	int ret;
 
-	mr = kmalloc(sizeof(*mr), GFP_KERNEL);
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 	if (mr == NULL)
 		return  ERR_PTR(-ENOMEM);
 
 	mr->type = MR_TYPE_DMA;
 
 	/* Allocate memory region key */
-	ret = hns_roce_mr_alloc(to_hr_dev(pd->device), to_hr_pd(pd)->pdn, 0,
-				~0ULL, acc, 0, mr);
+	hns_roce_hem_list_init(&mr->pbl_mtr.hem_list);
+	ret = alloc_mr_key(hr_dev, mr, to_hr_pd(pd)->pdn, 0, 0, acc);
 	if (ret)
 		goto err_free;
 
@@ -1004,203 +249,52 @@
 		goto err_mr;
 
 	mr->ibmr.rkey = mr->ibmr.lkey = mr->key;
-	mr->umem = NULL;
 
 	return &mr->ibmr;
-
 err_mr:
-	hns_roce_mr_free(to_hr_dev(pd->device), mr);
+	free_mr_key(hr_dev, mr);
 
 err_free:
 	kfree(mr);
 	return ERR_PTR(ret);
 }
 
-int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev,
-			       struct hns_roce_mtt *mtt, struct ib_umem *umem)
-{
-	struct device *dev = hr_dev->dev;
-	struct sg_dma_page_iter sg_iter;
-	unsigned int order;
-	int npage = 0;
-	int ret = 0;
-	int i;
-	u64 page_addr;
-	u64 *pages;
-	u32 bt_page_size;
-	u32 n;
-
-	switch (mtt->mtt_type) {
-	case MTT_TYPE_WQE:
-		order = hr_dev->caps.mtt_ba_pg_sz;
-		break;
-	case MTT_TYPE_CQE:
-		order = hr_dev->caps.cqe_ba_pg_sz;
-		break;
-	case MTT_TYPE_SRQWQE:
-		order = hr_dev->caps.srqwqe_ba_pg_sz;
-		break;
-	case MTT_TYPE_IDX:
-		order = hr_dev->caps.idx_ba_pg_sz;
-		break;
-	default:
-		dev_err(dev, "Unsupport mtt type %d, write mtt failed\n",
-			mtt->mtt_type);
-		return -EINVAL;
-	}
-
-	bt_page_size = 1 << (order + PAGE_SHIFT);
-
-	pages = (u64 *) __get_free_pages(GFP_KERNEL, order);
-	if (!pages)
-		return -ENOMEM;
-
-	i = n = 0;
-
-	for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
-		page_addr = sg_page_iter_dma_address(&sg_iter);
-		if (!(npage % (1 << (mtt->page_shift - PAGE_SHIFT)))) {
-			if (page_addr & ((1 << mtt->page_shift) - 1)) {
-				dev_err(dev,
-					"page_addr is not page_shift %d alignment!\n",
-					mtt->page_shift);
-				ret = -EINVAL;
-				goto out;
-			}
-			pages[i++] = page_addr;
-		}
-		npage++;
-		if (i == bt_page_size / sizeof(u64)) {
-			ret = hns_roce_write_mtt(hr_dev, mtt, n, i, pages);
-			if (ret)
-				goto out;
-			n += i;
-			i = 0;
-		}
-	}
-
-	if (i)
-		ret = hns_roce_write_mtt(hr_dev, mtt, n, i, pages);
-
-out:
-	free_pages((unsigned long) pages, order);
-	return ret;
-}
-
-static int hns_roce_ib_umem_write_mr(struct hns_roce_dev *hr_dev,
-				     struct hns_roce_mr *mr,
-				     struct ib_umem *umem)
-{
-	struct sg_dma_page_iter sg_iter;
-	int i = 0, j = 0;
-	u64 page_addr;
-	u32 pbl_bt_sz;
-
-	if (hr_dev->caps.pbl_hop_num == HNS_ROCE_HOP_NUM_0)
-		return 0;
-
-	pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT);
-	for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
-		page_addr = sg_page_iter_dma_address(&sg_iter);
-		if (!hr_dev->caps.pbl_hop_num) {
-			/* for hip06, page addr is aligned to 4K */
-			mr->pbl_buf[i++] = page_addr >> 12;
-		} else if (hr_dev->caps.pbl_hop_num == 1) {
-			mr->pbl_buf[i++] = page_addr;
-		} else {
-			if (hr_dev->caps.pbl_hop_num == 2)
-				mr->pbl_bt_l1[i][j] = page_addr;
-			else if (hr_dev->caps.pbl_hop_num == 3)
-				mr->pbl_bt_l2[i][j] = page_addr;
-
-			j++;
-			if (j >= (pbl_bt_sz / BA_BYTE_LEN)) {
-				i++;
-				j = 0;
-			}
-		}
-	}
-
-	/* Memory barrier */
-	mb();
-
-	return 0;
-}
-
 struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				   u64 virt_addr, int access_flags,
 				   struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
-	struct device *dev = hr_dev->dev;
 	struct hns_roce_mr *mr;
-	int bt_size;
 	int ret;
-	int n;
-	int i;
 
-	mr = kmalloc(sizeof(*mr), GFP_KERNEL);
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	mr->umem = ib_umem_get(udata, start, length, access_flags, 0);
-	if (IS_ERR(mr->umem)) {
-		ret = PTR_ERR(mr->umem);
-		goto err_free;
-	}
-
-	n = ib_umem_page_count(mr->umem);
-
-	if (!hr_dev->caps.pbl_hop_num) {
-		if (n > HNS_ROCE_MAX_MTPT_PBL_NUM) {
-			dev_err(dev,
-			     " MR len %lld err. MR is limited to 4G at most!\n",
-			     length);
-			ret = -EINVAL;
-			goto err_umem;
-		}
-	} else {
-		u64 pbl_size = 1;
-
-		bt_size = (1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT)) /
-			  BA_BYTE_LEN;
-		for (i = 0; i < hr_dev->caps.pbl_hop_num; i++)
-			pbl_size *= bt_size;
-		if (n > pbl_size) {
-			dev_err(dev,
-			    " MR len %lld err. MR page num is limited to %lld!\n",
-			    length, pbl_size);
-			ret = -EINVAL;
-			goto err_umem;
-		}
-	}
-
 	mr->type = MR_TYPE_MR;
-
-	ret = hns_roce_mr_alloc(hr_dev, to_hr_pd(pd)->pdn, virt_addr, length,
-				access_flags, n, mr);
+	ret = alloc_mr_key(hr_dev, mr, to_hr_pd(pd)->pdn, virt_addr, length,
+			   access_flags);
 	if (ret)
-		goto err_umem;
+		goto err_alloc_mr;
 
-	ret = hns_roce_ib_umem_write_mr(hr_dev, mr, mr->umem);
+	ret = alloc_mr_pbl(hr_dev, mr, length, udata, start, access_flags);
 	if (ret)
-		goto err_mr;
+		goto err_alloc_key;
 
 	ret = hns_roce_mr_enable(hr_dev, mr);
 	if (ret)
-		goto err_mr;
+		goto err_alloc_pbl;
 
 	mr->ibmr.rkey = mr->ibmr.lkey = mr->key;
+	mr->ibmr.length = length;
 
 	return &mr->ibmr;
 
-err_mr:
-	hns_roce_mr_free(hr_dev, mr);
-
-err_umem:
-	ib_umem_release(mr->umem);
-
-err_free:
+err_alloc_pbl:
+	free_mr_pbl(hr_dev, mr);
+err_alloc_key:
+	free_mr_key(hr_dev, mr);
+err_alloc_mr:
 	kfree(mr);
 	return ERR_PTR(ret);
 }
@@ -1212,84 +306,36 @@
 			  u32 pdn, struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device);
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_mr *mr = to_hr_mr(ibmr);
-	struct device *dev = hr_dev->dev;
-	int npages;
 	int ret;
 
-	if (mr->size != ~0ULL) {
-		npages = ib_umem_page_count(mr->umem);
-
-		if (hr_dev->caps.pbl_hop_num)
-			hns_roce_mhop_free(hr_dev, mr);
-		else
-			dma_free_coherent(dev, npages * 8,
-					  mr->pbl_buf, mr->pbl_dma_addr);
-	}
-	ib_umem_release(mr->umem);
-
-	mr->umem = ib_umem_get(udata, start, length, mr_access_flags, 0);
-	if (IS_ERR(mr->umem)) {
-		ret = PTR_ERR(mr->umem);
-		mr->umem = NULL;
-		return -ENOMEM;
-	}
-	npages = ib_umem_page_count(mr->umem);
-
-	if (hr_dev->caps.pbl_hop_num) {
-		ret = hns_roce_mhop_alloc(hr_dev, npages, mr);
-		if (ret)
-			goto release_umem;
-	} else {
-		mr->pbl_buf = dma_alloc_coherent(dev, npages * 8,
-						 &(mr->pbl_dma_addr),
-						 GFP_KERNEL);
-		if (!mr->pbl_buf) {
-			ret = -ENOMEM;
-			goto release_umem;
-		}
+	free_mr_pbl(hr_dev, mr);
+	ret = alloc_mr_pbl(hr_dev, mr, length, udata, start, mr_access_flags);
+	if (ret) {
+		ibdev_err(ibdev, "failed to create mr PBL, ret = %d.\n", ret);
+		return ret;
 	}
 
 	ret = hr_dev->hw->rereg_write_mtpt(hr_dev, mr, flags, pdn,
 					   mr_access_flags, virt_addr,
 					   length, mailbox->buf);
-	if (ret)
-		goto release_umem;
-
-
-	ret = hns_roce_ib_umem_write_mr(hr_dev, mr, mr->umem);
 	if (ret) {
-		if (mr->size != ~0ULL) {
-			npages = ib_umem_page_count(mr->umem);
-
-			if (hr_dev->caps.pbl_hop_num)
-				hns_roce_mhop_free(hr_dev, mr);
-			else
-				dma_free_coherent(dev, npages * 8,
-						  mr->pbl_buf,
-						  mr->pbl_dma_addr);
-		}
-
-		goto release_umem;
+		ibdev_err(ibdev, "failed to write mtpt, ret = %d.\n", ret);
+		free_mr_pbl(hr_dev, mr);
 	}
 
-	return 0;
-
-release_umem:
-	ib_umem_release(mr->umem);
 	return ret;
-
 }
 
-
 int hns_roce_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length,
 			   u64 virt_addr, int mr_access_flags, struct ib_pd *pd,
 			   struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device);
+	struct ib_device *ib_dev = &hr_dev->ib_dev;
 	struct hns_roce_mr *mr = to_hr_mr(ibmr);
 	struct hns_roce_cmd_mailbox *mailbox;
-	struct device *dev = hr_dev->dev;
 	unsigned long mtpt_idx;
 	u32 pdn = 0;
 	int ret;
@@ -1308,9 +354,9 @@
 	if (ret)
 		goto free_cmd_mbox;
 
-	ret = hns_roce_hw2sw_mpt(hr_dev, NULL, mtpt_idx);
+	ret = hns_roce_hw_destroy_mpt(hr_dev, NULL, mtpt_idx);
 	if (ret)
-		dev_warn(dev, "HW2SW_MPT failed (%d)\n", ret);
+		ibdev_warn(ib_dev, "failed to destroy MPT, ret = %d.\n", ret);
 
 	mr->enabled = 0;
 
@@ -1332,10 +378,9 @@
 			goto free_cmd_mbox;
 	}
 
-	ret = hns_roce_sw2hw_mpt(hr_dev, mailbox, mtpt_idx);
+	ret = hns_roce_hw_create_mpt(hr_dev, mailbox, mtpt_idx);
 	if (ret) {
-		dev_err(dev, "SW2HW_MPT failed (%d)\n", ret);
-		ib_umem_release(mr->umem);
+		ibdev_err(ib_dev, "failed to create MPT, ret = %d.\n", ret);
 		goto free_cmd_mbox;
 	}
 
@@ -1363,8 +408,6 @@
 		ret = hr_dev->hw->dereg_mr(hr_dev, mr, udata);
 	} else {
 		hns_roce_mr_free(hr_dev, mr);
-
-		ib_umem_release(mr->umem);
 		kfree(mr);
 	}
 
@@ -1372,18 +415,14 @@
 }
 
 struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-				u32 max_num_sg, struct ib_udata *udata)
+				u32 max_num_sg)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
 	struct device *dev = hr_dev->dev;
 	struct hns_roce_mr *mr;
 	u64 length;
-	u32 page_size;
 	int ret;
 
-	page_size = 1 << (hr_dev->caps.pbl_buf_pg_sz + PAGE_SHIFT);
-	length = max_num_sg * page_size;
-
 	if (mr_type != IB_MR_TYPE_MEM_REG)
 		return ERR_PTR(-EINVAL);
 
@@ -1400,23 +439,28 @@
 	mr->type = MR_TYPE_FRMR;
 
 	/* Allocate memory region key */
-	ret = hns_roce_mr_alloc(hr_dev, to_hr_pd(pd)->pdn, 0, length,
-				0, max_num_sg, mr);
+	length = max_num_sg * (1 << PAGE_SHIFT);
+	ret = alloc_mr_key(hr_dev, mr, to_hr_pd(pd)->pdn, 0, length, 0);
 	if (ret)
 		goto err_free;
 
+	ret = alloc_mr_pbl(hr_dev, mr, length, NULL, 0, 0);
+	if (ret)
+		goto err_key;
+
 	ret = hns_roce_mr_enable(hr_dev, mr);
 	if (ret)
-		goto err_mr;
+		goto err_pbl;
 
 	mr->ibmr.rkey = mr->ibmr.lkey = mr->key;
-	mr->umem = NULL;
+	mr->ibmr.length = length;
 
 	return &mr->ibmr;
 
-err_mr:
-	hns_roce_mr_free(to_hr_dev(pd->device), mr);
-
+err_key:
+	free_mr_key(hr_dev, mr);
+err_pbl:
+	free_mr_pbl(hr_dev, mr);
 err_free:
 	kfree(mr);
 	return ERR_PTR(ret);
@@ -1426,19 +470,54 @@
 {
 	struct hns_roce_mr *mr = to_hr_mr(ibmr);
 
-	mr->pbl_buf[mr->npages++] = addr;
+	if (likely(mr->npages < mr->pbl_mtr.hem_cfg.buf_pg_count)) {
+		mr->page_list[mr->npages++] = addr;
+		return 0;
+	}
 
-	return 0;
+	return -ENOBUFS;
 }
 
 int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		       unsigned int *sg_offset)
 {
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device);
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_mr *mr = to_hr_mr(ibmr);
+	struct hns_roce_mtr *mtr = &mr->pbl_mtr;
+	int ret = 0;
 
 	mr->npages = 0;
+	mr->page_list = kvcalloc(mr->pbl_mtr.hem_cfg.buf_pg_count,
+				 sizeof(dma_addr_t), GFP_KERNEL);
+	if (!mr->page_list)
+		return ret;
 
-	return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, hns_roce_set_page);
+	ret = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, hns_roce_set_page);
+	if (ret < 1) {
+		ibdev_err(ibdev, "failed to store sg pages %u %u, cnt = %d.\n",
+			  mr->npages, mr->pbl_mtr.hem_cfg.buf_pg_count, ret);
+		goto err_page_list;
+	}
+
+	mtr->hem_cfg.region[0].offset = 0;
+	mtr->hem_cfg.region[0].count = mr->npages;
+	mtr->hem_cfg.region[0].hopnum = mr->pbl_hop_num;
+	mtr->hem_cfg.region_count = 1;
+	ret = hns_roce_mtr_map(hr_dev, mtr, mr->page_list, mr->npages);
+	if (ret) {
+		ibdev_err(ibdev, "failed to map sg mtr, ret = %d.\n", ret);
+		ret = 0;
+	} else {
+		mr->pbl_mtr.hem_cfg.buf_pg_shift = ilog2(ibmr->page_size);
+		ret = mr->npages;
+	}
+
+err_page_list:
+	kvfree(mr->page_list);
+	mr->page_list = NULL;
+
+	return ret;
 }
 
 static void hns_roce_mw_free(struct hns_roce_dev *hr_dev,
@@ -1448,10 +527,11 @@
 	int ret;
 
 	if (mw->enabled) {
-		ret = hns_roce_hw2sw_mpt(hr_dev, NULL, key_to_hw_index(mw->rkey)
-					 & (hr_dev->caps.num_mtpts - 1));
+		ret = hns_roce_hw_destroy_mpt(hr_dev, NULL,
+					      key_to_hw_index(mw->rkey) &
+					      (hr_dev->caps.num_mtpts - 1));
 		if (ret)
-			dev_warn(dev, "MW HW2SW_MPT failed (%d)\n", ret);
+			dev_warn(dev, "MW DESTROY_MPT failed (%d)\n", ret);
 
 		hns_roce_table_put(hr_dev, &hr_dev->mr_table.mtpt_table,
 				   key_to_hw_index(mw->rkey));
@@ -1487,10 +567,10 @@
 		goto err_page;
 	}
 
-	ret = hns_roce_sw2hw_mpt(hr_dev, mailbox,
-				 mtpt_idx & (hr_dev->caps.num_mtpts - 1));
+	ret = hns_roce_hw_create_mpt(hr_dev, mailbox,
+				     mtpt_idx & (hr_dev->caps.num_mtpts - 1));
 	if (ret) {
-		dev_err(dev, "MW sw2hw_mpt failed (%d)\n", ret);
+		dev_err(dev, "MW CREATE_MPT failed (%d)\n", ret);
 		goto err_page;
 	}
 
@@ -1509,28 +589,22 @@
 	return ret;
 }
 
-struct ib_mw *hns_roce_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type,
-				struct ib_udata *udata)
+int hns_roce_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
 {
-	struct hns_roce_dev *hr_dev = to_hr_dev(ib_pd->device);
-	struct hns_roce_mw *mw;
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibmw->device);
+	struct hns_roce_mw *mw = to_hr_mw(ibmw);
 	unsigned long index = 0;
 	int ret;
 
-	mw = kmalloc(sizeof(*mw), GFP_KERNEL);
-	if (!mw)
-		return ERR_PTR(-ENOMEM);
-
 	/* Allocate a key for mw from bitmap */
 	ret = hns_roce_bitmap_alloc(&hr_dev->mr_table.mtpt_bitmap, &index);
 	if (ret)
-		goto err_bitmap;
+		return ret;
 
 	mw->rkey = hw_index_to_key(index);
 
-	mw->ibmw.rkey = mw->rkey;
-	mw->ibmw.type = type;
-	mw->pdn = to_hr_pd(ib_pd)->pdn;
+	ibmw->rkey = mw->rkey;
+	mw->pdn = to_hr_pd(ibmw->pd)->pdn;
 	mw->pbl_hop_num = hr_dev->caps.pbl_hop_num;
 	mw->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz;
 	mw->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz;
@@ -1539,15 +613,11 @@
 	if (ret)
 		goto err_mw;
 
-	return &mw->ibmw;
+	return 0;
 
 err_mw:
 	hns_roce_mw_free(hr_dev, mw);
-
-err_bitmap:
-	kfree(mw);
-
-	return ERR_PTR(ret);
+	return ret;
 }
 
 int hns_roce_dealloc_mw(struct ib_mw *ibmw)
@@ -1556,37 +626,26 @@
 	struct hns_roce_mw *mw = to_hr_mw(ibmw);
 
 	hns_roce_mw_free(hr_dev, mw);
-	kfree(mw);
-
 	return 0;
 }
 
-void hns_roce_mtr_init(struct hns_roce_mtr *mtr, int bt_pg_shift,
-		       int buf_pg_shift)
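+/* write the page addresses of one buffer region into its MTT entries */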
+static int mtr_map_region(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+			  dma_addr_t *pages, struct hns_roce_buf_region *region)
 {
-	hns_roce_hem_list_init(&mtr->hem_list, bt_pg_shift);
-	mtr->buf_pg_shift = buf_pg_shift;
-}
-
-void hns_roce_mtr_cleanup(struct hns_roce_dev *hr_dev,
-			  struct hns_roce_mtr *mtr)
-{
-	hns_roce_hem_list_release(hr_dev, &mtr->hem_list);
-}
-
-static int hns_roce_write_mtr(struct hns_roce_dev *hr_dev,
-			      struct hns_roce_mtr *mtr, dma_addr_t *bufs,
-			      struct hns_roce_buf_region *r)
-{
+	__le64 *mtts;
 	int offset;
 	int count;
 	int npage;
-	u64 *mtts;
+	u64 addr;
 	int end;
 	int i;
 
-	offset = r->offset;
-	end = offset + r->count;
+	/* if hopnum is 0, the buffer can't store BAs, so skip writing mtt */
+	if (!region->hopnum)
+		return 0;
+
+	offset = region->offset;
+	end = offset + region->count;
 	npage = 0;
 	while (offset < end) {
 		mtts = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
@@ -1594,13 +653,13 @@
 		if (!mtts)
 			return -ENOBUFS;
 
-		/* Save page addr, low 12 bits : 0 */
 		for (i = 0; i < count; i++) {
 			if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
-				mtts[i] = bufs[npage] >> PAGE_ADDR_SHIFT;
+				addr = to_hr_hw_page_addr(pages[npage]);
 			else
-				mtts[i] = bufs[npage];
+				addr = pages[npage];
 
+			mtts[i] = cpu_to_le64(addr);
 			npage++;
 		}
 		offset += count;
@@ -1609,69 +668,449 @@
 	return 0;
 }
 
-int hns_roce_mtr_attach(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
-			dma_addr_t **bufs, struct hns_roce_buf_region *regions,
-			int region_cnt)
+static inline bool mtr_has_mtt(struct hns_roce_buf_attr *attr)
 {
-	struct hns_roce_buf_region *r;
-	int ret;
 	int i;
 
-	ret = hns_roce_hem_list_request(hr_dev, &mtr->hem_list, regions,
-					region_cnt);
-	if (ret)
-		return ret;
+	for (i = 0; i < attr->region_count; i++)
+		if (attr->region[i].hopnum != HNS_ROCE_HOP_NUM_0 &&
+		    attr->region[i].hopnum > 0)
+			return true;
 
-	for (i = 0; i < region_cnt; i++) {
-		r = &regions[i];
-		ret = hns_roce_write_mtr(hr_dev, mtr, bufs[i], r);
+	/* because the mtr has only one root base address, a hopnum of 0 means
+	 * the root base address equals the first buffer address, so all the
+	 * allocated memory must be in continuous space accessed in direct mode.
+	 */
+	return false;
+}
+
+static inline size_t mtr_bufs_size(struct hns_roce_buf_attr *attr)
+{
+	size_t size = 0;
+	int i;
+
+	for (i = 0; i < attr->region_count; i++)
+		size += attr->region[i].size;
+
+	return size;
+}
+
+static inline size_t mtr_kmem_direct_size(bool is_direct, size_t alloc_size,
+					  unsigned int page_shift)
+{
+	if (is_direct)
+		return ALIGN(alloc_size, 1 << page_shift);
+	else
+		return HNS_HW_DIRECT_PAGE_COUNT << page_shift;
+}
+
+/*
+ * Check whether the given pages lie in a continuous address space.
+ * Returns 0 on success, or the index of the first non-contiguous page.
+ */
+static inline int mtr_check_direct_pages(dma_addr_t *pages, int page_count,
+					 unsigned int page_shift)
+{
+	size_t page_size = 1 << page_shift;
+	int i;
+
+	for (i = 1; i < page_count; i++)
+		if (pages[i] - pages[i - 1] != page_size)
+			return i;
+
+	return 0;
+}
+
+static void mtr_free_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
+{
+	/* release user buffers */
+	if (mtr->umem) {
+		ib_umem_release(mtr->umem);
+		mtr->umem = NULL;
+	}
+
+	/* release kernel buffers */
+	if (mtr->kmem) {
+		hns_roce_buf_free(hr_dev, mtr->kmem);
+		kfree(mtr->kmem);
+		mtr->kmem = NULL;
+	}
+}
+
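+/*
+ * Get the buffer pages for an MTR: pin the userspace region when udata is
+ * given, otherwise allocate kernel pages, then record the resulting page
+ * shift and total page count in hem_cfg.
+ */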
+static int mtr_alloc_bufs(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+			  struct hns_roce_buf_attr *buf_attr, bool is_direct,
+			  struct ib_udata *udata, unsigned long user_addr)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	unsigned int best_pg_shift;
+	int all_pg_count = 0;
+	size_t direct_size;
+	size_t total_size;
+	int ret;
+
+	total_size = mtr_bufs_size(buf_attr);
+	if (total_size < 1) {
+		ibdev_err(ibdev, "Failed to check mtr size\n");
+		return -EINVAL;
+	}
+
+	if (udata) {
+		unsigned long pgsz_bitmap;
+		unsigned long page_size;
+
+		mtr->kmem = NULL;
+		mtr->umem = ib_umem_get(ibdev, user_addr, total_size,
+					buf_attr->user_access);
+		if (IS_ERR_OR_NULL(mtr->umem)) {
+			ibdev_err(ibdev, "Failed to get umem, ret %ld\n",
+				  PTR_ERR(mtr->umem));
+			return -ENOMEM;
+		}
+		if (buf_attr->fixed_page)
+			pgsz_bitmap = 1 << buf_attr->page_shift;
+		else
+			pgsz_bitmap = GENMASK(buf_attr->page_shift, PAGE_SHIFT);
+
+		page_size = ib_umem_find_best_pgsz(mtr->umem, pgsz_bitmap,
+						   user_addr);
+		if (!page_size)
+			return -EINVAL;
+		best_pg_shift = order_base_2(page_size);
+		all_pg_count = ib_umem_num_dma_blocks(mtr->umem, page_size);
+		ret = 0;
+	} else {
+		mtr->umem = NULL;
+		mtr->kmem = kzalloc(sizeof(*mtr->kmem), GFP_KERNEL);
+		if (!mtr->kmem) {
+			ibdev_err(ibdev, "Failed to alloc kmem\n");
+			return -ENOMEM;
+		}
+		direct_size = mtr_kmem_direct_size(is_direct, total_size,
+						   buf_attr->page_shift);
+		ret = hns_roce_buf_alloc(hr_dev, total_size, direct_size,
+					 mtr->kmem, buf_attr->page_shift);
 		if (ret) {
-			dev_err(hr_dev->dev,
-				"write mtr[%d/%d] err %d,offset=%d.\n",
-				i, region_cnt, ret,  r->offset);
-			goto err_write;
+			ibdev_err(ibdev, "Failed to alloc kmem, ret %d\n", ret);
+			goto err_alloc_mem;
+		}
+		best_pg_shift = buf_attr->page_shift;
+		all_pg_count = mtr->kmem->npages;
+	}
+
+	/* must be bigger than the minimum hardware page shift */
+	if (best_pg_shift < HNS_HW_PAGE_SHIFT || all_pg_count < 1) {
+		ret = -EINVAL;
+		ibdev_err(ibdev, "Failed to check mtr page shift %d count %d\n",
+			  best_pg_shift, all_pg_count);
+		goto err_alloc_mem;
+	}
+
+	mtr->hem_cfg.buf_pg_shift = best_pg_shift;
+	mtr->hem_cfg.buf_pg_count = all_pg_count;
+
+	return 0;
+err_alloc_mem:
+	mtr_free_bufs(hr_dev, mtr);
+	return ret;
+}
+
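+/*
+ * Collect the DMA addresses of all buffer pages (from the umem or the kernel
+ * buffer) and, in direct mode, check that they are physically contiguous.
+ */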
+static int mtr_get_pages(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+			 dma_addr_t *pages, int count, unsigned int page_shift)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	int npage;
+	int err;
+
+	if (mtr->umem)
+		npage = hns_roce_get_umem_bufs(hr_dev, pages, count, 0,
+					       mtr->umem, page_shift);
+	else
+		npage = hns_roce_get_kmem_bufs(hr_dev, pages, count, 0,
+					       mtr->kmem);
+
+	if (mtr->hem_cfg.is_direct && npage > 1) {
+		err = mtr_check_direct_pages(pages, npage, page_shift);
+		if (err) {
+			ibdev_err(ibdev, "Failed to check %s direct page-%d\n",
+				  mtr->umem ? "user" : "kernel", err);
+			npage = err;
+		}
+	}
+
+	return npage;
+}
+
+int hns_roce_mtr_map(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+		     dma_addr_t *pages, int page_cnt)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_buf_region *r;
+	int err;
+	int i;
+
+	/*
+	 * Only use the first page address as the root BA when hopnum is 0,
+	 * because the addresses of all pages are consecutive in that case.
+	 */
+	if (mtr->hem_cfg.is_direct) {
+		mtr->hem_cfg.root_ba = pages[0];
+		return 0;
+	}
+
+	for (i = 0; i < mtr->hem_cfg.region_count; i++) {
+		r = &mtr->hem_cfg.region[i];
+		if (r->offset + r->count > page_cnt) {
+			err = -EINVAL;
+			ibdev_err(ibdev,
+				  "failed to check mtr%u end %u + %u, max %u.\n",
+				  i, r->offset, r->count, page_cnt);
+			return err;
+		}
+
+		err = mtr_map_region(hr_dev, mtr, &pages[r->offset], r);
+		if (err) {
+			ibdev_err(ibdev,
+				  "failed to map mtr%u offset %u, ret = %d.\n",
+				  i, r->offset, err);
+			return err;
 		}
 	}
 
 	return 0;
-
-err_write:
-	hns_roce_hem_list_release(hr_dev, &mtr->hem_list);
-
-	return ret;
 }
 
 int hns_roce_mtr_find(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
 		      int offset, u64 *mtt_buf, int mtt_max, u64 *base_addr)
 {
-	u64 *mtts = mtt_buf;
+	struct hns_roce_hem_cfg *cfg = &mtr->hem_cfg;
+	int start_index;
 	int mtt_count;
 	int total = 0;
-	u64 *addr;
+	__le64 *mtts;
 	int npage;
+	u64 addr;
 	int left;
 
-	if (mtts == NULL || mtt_max < 1)
+	if (!mtt_buf || mtt_max < 1)
 		goto done;
 
+	/* no mtt memory in direct mode, so just return the buffer address */
+	if (cfg->is_direct) {
+		start_index = offset >> HNS_HW_PAGE_SHIFT;
+		for (mtt_count = 0; mtt_count < cfg->region_count &&
+		     total < mtt_max; mtt_count++) {
+			npage = cfg->region[mtt_count].offset;
+			if (npage < start_index)
+				continue;
+
+			addr = cfg->root_ba + (npage << HNS_HW_PAGE_SHIFT);
+			if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
+				mtt_buf[total] = to_hr_hw_page_addr(addr);
+			else
+				mtt_buf[total] = addr;
+
+			total++;
+		}
+
+		goto done;
+	}
+
+	start_index = offset >> cfg->buf_pg_shift;
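+	/* walk the MTT in the hem list and copy out the page addresses */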
 	left = mtt_max;
 	while (left > 0) {
 		mtt_count = 0;
-		addr = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
-						  offset + total,
+		mtts = hns_roce_hem_list_find_mtt(hr_dev, &mtr->hem_list,
+						  start_index + total,
 						  &mtt_count, NULL);
-		if (!addr || !mtt_count)
+		if (!mtts || !mtt_count)
 			goto done;
 
 		npage = min(mtt_count, left);
-		memcpy(&mtts[total], addr, BA_BYTE_LEN * npage);
 		left -= npage;
-		total += npage;
+		for (mtt_count = 0; mtt_count < npage; mtt_count++)
+			mtt_buf[total++] = le64_to_cpu(mtts[mtt_count]);
 	}
 
 done:
 	if (base_addr)
-		*base_addr = mtr->hem_list.root_ba;
+		*base_addr = cfg->root_ba;
 
 	return total;
 }
+
+static int mtr_init_buf_cfg(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_buf_attr *attr,
+			    struct hns_roce_hem_cfg *cfg,
+			    unsigned int *buf_page_shift)
+{
+	struct hns_roce_buf_region *r;
+	unsigned int page_shift;
+	int page_cnt = 0;
+	size_t buf_size;
+	int region_cnt;
+
+	if (cfg->is_direct) {
+		buf_size = cfg->buf_pg_count << cfg->buf_pg_shift;
+		page_cnt = DIV_ROUND_UP(buf_size, HNS_HW_PAGE_SIZE);
+		/*
+		 * When the HEM buffer uses level-0 addressing, the page size
+		 * equals the buffer size, and the page size = 4K * 2^N.
+		 */
+		cfg->buf_pg_shift = HNS_HW_PAGE_SHIFT + order_base_2(page_cnt);
+		if (attr->region_count > 1) {
+			cfg->buf_pg_count = page_cnt;
+			page_shift = HNS_HW_PAGE_SHIFT;
+		} else {
+			cfg->buf_pg_count = 1;
+			page_shift = cfg->buf_pg_shift;
+			if (buf_size != 1 << page_shift) {
+				ibdev_err(&hr_dev->ib_dev,
+					  "failed to check direct size %zu shift %d.\n",
+					  buf_size, page_shift);
+				return -EINVAL;
+			}
+		}
+	} else {
+		page_shift = cfg->buf_pg_shift;
+	}
+
+	/* convert buffer size to page index and page count */
+	for (page_cnt = 0, region_cnt = 0; page_cnt < cfg->buf_pg_count &&
+	     region_cnt < attr->region_count &&
+	     region_cnt < ARRAY_SIZE(cfg->region); region_cnt++) {
+		r = &cfg->region[region_cnt];
+		r->offset = page_cnt;
+		buf_size = hr_hw_page_align(attr->region[region_cnt].size);
+		r->count = DIV_ROUND_UP(buf_size, 1 << page_shift);
+		page_cnt += r->count;
+		r->hopnum = to_hr_hem_hopnum(attr->region[region_cnt].hopnum,
+					     r->count);
+	}
+
+	if (region_cnt < 1) {
+		ibdev_err(&hr_dev->ib_dev,
+			  "failed to check mtr region count, pages = %d.\n",
+			  cfg->buf_pg_count);
+		return -ENOBUFS;
+	}
+
+	cfg->region_count = region_cnt;
+	*buf_page_shift = page_shift;
+
+	return page_cnt;
+}
+
+/**
+ * hns_roce_mtr_create - Create hns memory translate region.
+ *
+ * @mtr: memory translate region
+ * @buf_attr: buffer attribute for creating mtr
+ * @ba_page_shift: page shift for multi-hop base address table
+ * @udata: user space context, if it's NULL, means kernel space
+ * @user_addr: userspace virtual address to start at
+ */
+int hns_roce_mtr_create(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr,
+			struct hns_roce_buf_attr *buf_attr,
+			unsigned int ba_page_shift, struct ib_udata *udata,
+			unsigned long user_addr)
+{
+	struct hns_roce_hem_cfg *cfg = &mtr->hem_cfg;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	unsigned int buf_page_shift = 0;
+	dma_addr_t *pages = NULL;
+	int all_pg_cnt;
+	int get_pg_cnt;
+	int ret = 0;
+
+	/* if mtt is disabled, all pages must be physically contiguous */
+	cfg->is_direct = !mtr_has_mtt(buf_attr);
+
+	/* if the buffer only needs an mtt, just init the hem cfg */
+	if (buf_attr->mtt_only) {
+		cfg->buf_pg_shift = buf_attr->page_shift;
+		cfg->buf_pg_count = mtr_bufs_size(buf_attr) >>
+				    buf_attr->page_shift;
+		mtr->umem = NULL;
+		mtr->kmem = NULL;
+	} else {
+		ret = mtr_alloc_bufs(hr_dev, mtr, buf_attr, cfg->is_direct,
+				     udata, user_addr);
+		if (ret) {
+			ibdev_err(ibdev,
+				  "failed to alloc mtr bufs, ret = %d.\n", ret);
+			return ret;
+		}
+	}
+
+	all_pg_cnt = mtr_init_buf_cfg(hr_dev, buf_attr, cfg, &buf_page_shift);
+	if (all_pg_cnt < 1) {
+		ret = -ENOBUFS;
+		ibdev_err(ibdev, "failed to init mtr buf cfg.\n");
+		goto err_alloc_bufs;
+	}
+
+	hns_roce_hem_list_init(&mtr->hem_list);
+	if (!cfg->is_direct) {
+		ret = hns_roce_hem_list_request(hr_dev, &mtr->hem_list,
+						cfg->region, cfg->region_count,
+						ba_page_shift);
+		if (ret) {
+			ibdev_err(ibdev, "failed to request mtr hem, ret = %d.\n",
+				  ret);
+			goto err_alloc_bufs;
+		}
+		cfg->root_ba = mtr->hem_list.root_ba;
+		cfg->ba_pg_shift = ba_page_shift;
+	} else {
+		cfg->ba_pg_shift = cfg->buf_pg_shift;
+	}
+
+	/* no buffer to map */
+	if (buf_attr->mtt_only)
+		return 0;
+
+	/* alloc a tmp array to store buffer's dma address */
+	pages = kvcalloc(all_pg_cnt, sizeof(dma_addr_t), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		ibdev_err(ibdev, "failed to alloc mtr page list %d.\n",
+			  all_pg_cnt);
+		goto err_alloc_hem_list;
+	}
+
+	get_pg_cnt = mtr_get_pages(hr_dev, mtr, pages, all_pg_cnt,
+				   buf_page_shift);
+	if (get_pg_cnt != all_pg_cnt) {
+		ibdev_err(ibdev, "failed to get mtr page %d != %d.\n",
+			  get_pg_cnt, all_pg_cnt);
+		ret = -ENOBUFS;
+		goto err_alloc_page_list;
+	}
+
+	/* write buffer's dma address to BA table */
+	ret = hns_roce_mtr_map(hr_dev, mtr, pages, all_pg_cnt);
+	if (ret) {
+		ibdev_err(ibdev, "failed to map mtr pages, ret = %d.\n", ret);
+		goto err_alloc_page_list;
+	}
+
+	/* drop tmp array */
+	kvfree(pages);
+	return 0;
+err_alloc_page_list:
+	kvfree(pages);
+err_alloc_hem_list:
+	hns_roce_hem_list_release(hr_dev, &mtr->hem_list);
+err_alloc_bufs:
+	mtr_free_bufs(hr_dev, mtr);
+	return ret;
+}
+
+void hns_roce_mtr_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_mtr *mtr)
+{
+	/* release multi-hop addressing resource */
+	hns_roce_hem_list_release(hr_dev, &mtr->hem_list);
+
+	/* free buffers */
+	mtr_free_bufs(hr_dev, mtr);
+}
diff --git a/drivers/infiniband/hw/hns/hns_roce_pd.c b/drivers/infiniband/hw/hns/hns_roce_pd.c
index 912b89b..012a769 100644
--- a/drivers/infiniband/hw/hns/hns_roce_pd.c
+++ b/drivers/infiniband/hw/hns/hns_roce_pd.c
@@ -60,33 +60,33 @@
 int hns_roce_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
 	struct ib_device *ib_dev = ibpd->device;
-	struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
-	struct device *dev = hr_dev->dev;
 	struct hns_roce_pd *pd = to_hr_pd(ibpd);
 	int ret;
 
 	ret = hns_roce_pd_alloc(to_hr_dev(ib_dev), &pd->pdn);
 	if (ret) {
-		dev_err(dev, "[alloc_pd]hns_roce_pd_alloc failed!\n");
+		ibdev_err(ib_dev, "failed to alloc pd, ret = %d.\n", ret);
 		return ret;
 	}
 
 	if (udata) {
-		struct hns_roce_ib_alloc_pd_resp uresp = {.pdn = pd->pdn};
+		struct hns_roce_ib_alloc_pd_resp resp = {.pdn = pd->pdn};
 
-		if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
+		ret = ib_copy_to_udata(udata, &resp,
+				       min(udata->outlen, sizeof(resp)));
+		if (ret) {
 			hns_roce_pd_free(to_hr_dev(ib_dev), pd->pdn);
-			dev_err(dev, "[alloc_pd]ib_copy_to_udata failed!\n");
-			return -EFAULT;
+			ibdev_err(ib_dev, "failed to copy to udata, ret = %d\n", ret);
 		}
 	}
 
-	return 0;
+	return ret;
 }
 
-void hns_roce_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+int hns_roce_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
 	hns_roce_pd_free(to_hr_dev(pd->device), to_hr_pd(pd)->pdn);
+	return 0;
 }
 
 int hns_roce_uar_alloc(struct hns_roce_dev *hr_dev, struct hns_roce_uar *uar)
@@ -96,7 +96,7 @@
 
 	/* Using bitmap to manage UAR index */
 	ret = hns_roce_bitmap_alloc(&hr_dev->uar_table.bitmap, &uar->logic_idx);
-	if (ret == -1)
+	if (ret)
 		return -ENOMEM;
 
 	if (uar->logic_idx > 0 && hr_dev->caps.phy_num_uars > 1)
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index 730e50c..291e06d 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -41,7 +41,44 @@
 #include "hns_roce_hem.h"
 #include <rdma/hns-abi.h>
 
-#define SQP_NUM				(2 * HNS_ROCE_MAX_PORTS)
+static void flush_work_handle(struct work_struct *work)
+{
+	struct hns_roce_work *flush_work = container_of(work,
+					struct hns_roce_work, work);
+	struct hns_roce_qp *hr_qp = container_of(flush_work,
+					struct hns_roce_qp, flush_work);
+	struct device *dev = flush_work->hr_dev->dev;
+	struct ib_qp_attr attr;
+	int attr_mask;
+	int ret;
+
+	attr_mask = IB_QP_STATE;
+	attr.qp_state = IB_QPS_ERR;
+
+	if (test_and_clear_bit(HNS_ROCE_FLUSH_FLAG, &hr_qp->flush_flag)) {
+		ret = hns_roce_modify_qp(&hr_qp->ibqp, &attr, attr_mask, NULL);
+		if (ret)
+			dev_err(dev, "Modify QP to error state failed(%d) during CQE flush\n",
+				ret);
+	}
+
+	/*
+	 * make sure we signal the QP destroy path that the flush has
+	 * completed, so that it can now safely proceed to destroy the QP
+	 */
+	if (atomic_dec_and_test(&hr_qp->refcount))
+		complete(&hr_qp->free);
+}
+
+void init_flush_work(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
+{
+	struct hns_roce_work *flush_work = &hr_qp->flush_work;
+
+	flush_work->hr_dev = hr_dev;
+	INIT_WORK(&flush_work->work, flush_work_handle);
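+	/*
+	 * Hold a QP reference so the QP cannot be destroyed while the flush
+	 * work is pending; it is dropped in flush_work_handle().
+	 */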
+	atomic_inc(&hr_qp->refcount);
+	queue_work(hr_dev->irq_workq, &flush_work->work);
+}
 
 void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type)
 {
@@ -59,6 +96,15 @@
 		return;
 	}
 
+	if (hr_dev->hw_rev != HNS_ROCE_HW_VER1 &&
+	    (event_type == HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR ||
+	     event_type == HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR ||
+	     event_type == HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR)) {
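+		/*
+		 * Move the QP to the error state and schedule a CQE flush
+		 * if one is not already pending.
+		 */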
+		qp->state = IB_QPS_ERR;
+		if (!test_and_set_bit(HNS_ROCE_FLUSH_FLAG, &qp->flush_flag))
+			init_flush_work(hr_dev, qp);
+	}
+
 	qp->event(qp, (enum hns_roce_event)event_type);
 
 	if (atomic_dec_and_test(&qp->refcount))
@@ -108,15 +154,34 @@
 	}
 }
 
-static int hns_roce_reserve_range_qp(struct hns_roce_dev *hr_dev, int cnt,
-				     int align, unsigned long *base)
+static int alloc_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 {
-	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
+	unsigned long num = 0;
+	int ret;
 
-	return hns_roce_bitmap_alloc_range(&qp_table->bitmap, cnt, align,
-					   base) ?
-		       -ENOMEM :
-		       0;
+	if (hr_qp->ibqp.qp_type == IB_QPT_GSI) {
+		/* on hw v1 the GSI QPN is fixed, reserved per physical port */
+		if (hr_dev->hw_rev == HNS_ROCE_HW_VER1)
+			num = HNS_ROCE_MAX_PORTS +
+			      hr_dev->iboe.phy_port[hr_qp->port];
+		else
+			num = 1;
+
+		hr_qp->doorbell_qpn = 1;
+	} else {
+		ret = hns_roce_bitmap_alloc_range(&hr_dev->qp_table.bitmap,
+						  1, 1, &num);
+		if (ret) {
+			ibdev_err(&hr_dev->ib_dev, "Failed to alloc bitmap\n");
+			return -ENOMEM;
+		}
+
+		hr_qp->doorbell_qpn = (u32)num;
+	}
+
+	hr_qp->qpn = num;
+
+	return 0;
 }
 
 enum hns_roce_qp_state to_hns_roce_state(enum ib_qp_state state)
@@ -139,50 +204,75 @@
 	}
 }
 
-static int hns_roce_gsi_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn,
-				 struct hns_roce_qp *hr_qp)
+static void add_qp_to_list(struct hns_roce_dev *hr_dev,
+			   struct hns_roce_qp *hr_qp,
+			   struct ib_cq *send_cq, struct ib_cq *recv_cq)
+{
+	struct hns_roce_cq *hr_send_cq, *hr_recv_cq;
+	unsigned long flags;
+
+	hr_send_cq = send_cq ? to_hr_cq(send_cq) : NULL;
+	hr_recv_cq = recv_cq ? to_hr_cq(recv_cq) : NULL;
+
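+	/*
+	 * Link the QP into the device-wide QP list and into each CQ's list
+	 * under the CQ locks, so the software CQE flush path can walk them
+	 * safely.
+	 */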
+	spin_lock_irqsave(&hr_dev->qp_list_lock, flags);
+	hns_roce_lock_cqs(hr_send_cq, hr_recv_cq);
+
+	list_add_tail(&hr_qp->node, &hr_dev->qp_list);
+	if (hr_send_cq)
+		list_add_tail(&hr_qp->sq_node, &hr_send_cq->sq_list);
+	if (hr_recv_cq)
+		list_add_tail(&hr_qp->rq_node, &hr_recv_cq->rq_list);
+
+	hns_roce_unlock_cqs(hr_send_cq, hr_recv_cq);
+	spin_unlock_irqrestore(&hr_dev->qp_list_lock, flags);
+}
+
+static int hns_roce_qp_store(struct hns_roce_dev *hr_dev,
+			     struct hns_roce_qp *hr_qp,
+			     struct ib_qp_init_attr *init_attr)
 {
 	struct xarray *xa = &hr_dev->qp_table_xa;
 	int ret;
 
-	if (!qpn)
+	if (!hr_qp->qpn)
 		return -EINVAL;
 
-	hr_qp->qpn = qpn;
-	atomic_set(&hr_qp->refcount, 1);
-	init_completion(&hr_qp->free);
-
-	ret = xa_err(xa_store_irq(xa, hr_qp->qpn & (hr_dev->caps.num_qps - 1),
-				hr_qp, GFP_KERNEL));
+	ret = xa_err(xa_store_irq(xa, hr_qp->qpn, hr_qp, GFP_KERNEL));
 	if (ret)
-		dev_err(hr_dev->dev, "QPC xa_store failed\n");
+		dev_err(hr_dev->dev, "Failed to xa store for QPC\n");
+	else
+		/* add QP to device's QP list for softwc */
+		add_qp_to_list(hr_dev, hr_qp, init_attr->send_cq,
+			       init_attr->recv_cq);
 
 	return ret;
 }
 
-static int hns_roce_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn,
-			     struct hns_roce_qp *hr_qp)
+static int alloc_qpc(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 {
 	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
 	struct device *dev = hr_dev->dev;
 	int ret;
 
-	if (!qpn)
+	if (!hr_qp->qpn)
 		return -EINVAL;
 
-	hr_qp->qpn = qpn;
+	/* In v1 engine, GSI QP context is saved in the RoCE hw's register */
+	if (hr_qp->ibqp.qp_type == IB_QPT_GSI &&
+	    hr_dev->hw_rev == HNS_ROCE_HW_VER1)
+		return 0;
 
 	/* Alloc memory for QPC */
 	ret = hns_roce_table_get(hr_dev, &qp_table->qp_table, hr_qp->qpn);
 	if (ret) {
-		dev_err(dev, "QPC table get failed\n");
+		dev_err(dev, "Failed to get QPC table\n");
 		goto err_out;
 	}
 
 	/* Alloc memory for IRRL */
 	ret = hns_roce_table_get(hr_dev, &qp_table->irrl_table, hr_qp->qpn);
 	if (ret) {
-		dev_err(dev, "IRRL table get failed\n");
+		dev_err(dev, "Failed to get IRRL table\n");
 		goto err_put_qp;
 	}
 
@@ -191,32 +281,23 @@
 		ret = hns_roce_table_get(hr_dev, &qp_table->trrl_table,
 					 hr_qp->qpn);
 		if (ret) {
-			dev_err(dev, "TRRL table get failed\n");
+			dev_err(dev, "Failed to get TRRL table\n");
 			goto err_put_irrl;
 		}
 	}
 
-	if (hr_dev->caps.sccc_entry_sz) {
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL) {
 		/* Alloc memory for SCC CTX */
 		ret = hns_roce_table_get(hr_dev, &qp_table->sccc_table,
 					 hr_qp->qpn);
 		if (ret) {
-			dev_err(dev, "SCC CTX table get failed\n");
+			dev_err(dev, "Failed to get SCC CTX table\n");
 			goto err_put_trrl;
 		}
 	}
 
-	ret = hns_roce_gsi_qp_alloc(hr_dev, qpn, hr_qp);
-	if (ret)
-		goto err_put_sccc;
-
 	return 0;
 
-err_put_sccc:
-	if (hr_dev->caps.sccc_entry_sz)
-		hns_roce_table_put(hr_dev, &qp_table->sccc_table,
-				   hr_qp->qpn);
-
 err_put_trrl:
 	if (hr_dev->caps.trrl_entry_sz)
 		hns_roce_table_put(hr_dev, &qp_table->trrl_table, hr_qp->qpn);
@@ -236,94 +317,131 @@
 	struct xarray *xa = &hr_dev->qp_table_xa;
 	unsigned long flags;
 
+	list_del(&hr_qp->node);
+	list_del(&hr_qp->sq_node);
+	list_del(&hr_qp->rq_node);
+
 	xa_lock_irqsave(xa, flags);
 	__xa_erase(xa, hr_qp->qpn & (hr_dev->caps.num_qps - 1));
 	xa_unlock_irqrestore(xa, flags);
 }
 
-void hns_roce_qp_free(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
+static void free_qpc(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 {
 	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
 
-	if (atomic_dec_and_test(&hr_qp->refcount))
-		complete(&hr_qp->free);
-	wait_for_completion(&hr_qp->free);
-
-	if ((hr_qp->ibqp.qp_type) != IB_QPT_GSI) {
-		if (hr_dev->caps.trrl_entry_sz)
-			hns_roce_table_put(hr_dev, &qp_table->trrl_table,
-					   hr_qp->qpn);
-		hns_roce_table_put(hr_dev, &qp_table->irrl_table, hr_qp->qpn);
-	}
-}
-
-void hns_roce_release_range_qp(struct hns_roce_dev *hr_dev, int base_qpn,
-			       int cnt)
-{
-	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
-
-	if (base_qpn < hr_dev->caps.reserved_qps)
+	/* In v1 engine, GSI QP context is saved in the RoCE hw's register */
+	if (hr_qp->ibqp.qp_type == IB_QPT_GSI &&
+	    hr_dev->hw_rev == HNS_ROCE_HW_VER1)
 		return;
 
-	hns_roce_bitmap_free_range(&qp_table->bitmap, base_qpn, cnt, BITMAP_RR);
+	if (hr_dev->caps.trrl_entry_sz)
+		hns_roce_table_put(hr_dev, &qp_table->trrl_table, hr_qp->qpn);
+	hns_roce_table_put(hr_dev, &qp_table->irrl_table, hr_qp->qpn);
 }
 
-static int hns_roce_set_rq_size(struct hns_roce_dev *hr_dev,
-				struct ib_qp_cap *cap, bool is_user, int has_rq,
-				struct hns_roce_qp *hr_qp)
+static void free_qpn(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 {
-	struct device *dev = hr_dev->dev;
-	u32 max_cnt;
+	struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
 
-	/* Check the validity of QP support capacity */
-	if (cap->max_recv_wr > hr_dev->caps.max_wqes ||
-	    cap->max_recv_sge > hr_dev->caps.max_rq_sg) {
-		dev_err(dev, "RQ WR or sge error!max_recv_wr=%d max_recv_sge=%d\n",
-			cap->max_recv_wr, cap->max_recv_sge);
-		return -EINVAL;
-	}
+	if (hr_qp->ibqp.qp_type == IB_QPT_GSI)
+		return;
+
+	if (hr_qp->qpn < hr_dev->caps.reserved_qps)
+		return;
+
+	hns_roce_bitmap_free_range(&qp_table->bitmap, hr_qp->qpn, 1, BITMAP_RR);
+}
+
+static int set_rq_size(struct hns_roce_dev *hr_dev, struct ib_qp_cap *cap,
+		       struct hns_roce_qp *hr_qp, int has_rq)
+{
+	u32 cnt;
 
 	/* If srq exist, set zero for relative number of rq */
 	if (!has_rq) {
 		hr_qp->rq.wqe_cnt = 0;
 		hr_qp->rq.max_gs = 0;
+		hr_qp->rq_inl_buf.wqe_cnt = 0;
 		cap->max_recv_wr = 0;
 		cap->max_recv_sge = 0;
-	} else {
-		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge)) {
-			dev_err(dev, "user space no need config max_recv_wr max_recv_sge\n");
-			return -EINVAL;
-		}
 
-		if (hr_dev->caps.min_wqes)
-			max_cnt = max(cap->max_recv_wr, hr_dev->caps.min_wqes);
-		else
-			max_cnt = cap->max_recv_wr;
-
-		hr_qp->rq.wqe_cnt = roundup_pow_of_two(max_cnt);
-
-		if ((u32)hr_qp->rq.wqe_cnt > hr_dev->caps.max_wqes) {
-			dev_err(dev, "while setting rq size, rq.wqe_cnt too large\n");
-			return -EINVAL;
-		}
-
-		max_cnt = max(1U, cap->max_recv_sge);
-		hr_qp->rq.max_gs = roundup_pow_of_two(max_cnt);
-		if (hr_dev->caps.max_rq_sg <= 2)
-			hr_qp->rq.wqe_shift =
-					ilog2(hr_dev->caps.max_rq_desc_sz);
-		else
-			hr_qp->rq.wqe_shift =
-					ilog2(hr_dev->caps.max_rq_desc_sz
-					      * hr_qp->rq.max_gs);
+		return 0;
 	}
 
-	cap->max_recv_wr = hr_qp->rq.max_post = hr_qp->rq.wqe_cnt;
+	/* Check the validity of the RQ depth and SGE count */
+	if (!cap->max_recv_wr || cap->max_recv_wr > hr_dev->caps.max_wqes ||
+	    cap->max_recv_sge > hr_dev->caps.max_rq_sg) {
+		ibdev_err(&hr_dev->ib_dev, "RQ config error, depth=%u, sge=%d\n",
+			  cap->max_recv_wr, cap->max_recv_sge);
+		return -EINVAL;
+	}
+
+	cnt = roundup_pow_of_two(max(cap->max_recv_wr, hr_dev->caps.min_wqes));
+	if (cnt > hr_dev->caps.max_wqes) {
+		ibdev_err(&hr_dev->ib_dev, "rq depth %u too large\n",
+			  cap->max_recv_wr);
+		return -EINVAL;
+	}
+
+	hr_qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge));
+
+	if (hr_dev->caps.max_rq_sg <= HNS_ROCE_SGE_IN_WQE)
+		hr_qp->rq.wqe_shift = ilog2(hr_dev->caps.max_rq_desc_sz);
+	else
+		hr_qp->rq.wqe_shift = ilog2(hr_dev->caps.max_rq_desc_sz *
+					    hr_qp->rq.max_gs);
+
+	hr_qp->rq.wqe_cnt = cnt;
+	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE)
+		hr_qp->rq_inl_buf.wqe_cnt = cnt;
+	else
+		hr_qp->rq_inl_buf.wqe_cnt = 0;
+
+	cap->max_recv_wr = cnt;
 	cap->max_recv_sge = hr_qp->rq.max_gs;
 
 	return 0;
 }
 
+static int set_extend_sge_param(struct hns_roce_dev *hr_dev, u32 sq_wqe_cnt,
+				struct hns_roce_qp *hr_qp,
+				struct ib_qp_cap *cap)
+{
+	u32 cnt;
+
+	cnt = max(1U, cap->max_send_sge);
+	if (hr_dev->hw_rev == HNS_ROCE_HW_VER1) {
+		hr_qp->sq.max_gs = roundup_pow_of_two(cnt);
+		hr_qp->sge.sge_cnt = 0;
+
+		return 0;
+	}
+
+	hr_qp->sq.max_gs = cnt;
+
+	/* GSI/UD SQ WQEs place all of their SGEs in the extended SGE space */
+	if (hr_qp->ibqp.qp_type == IB_QPT_GSI ||
+	    hr_qp->ibqp.qp_type == IB_QPT_UD) {
+		cnt = roundup_pow_of_two(sq_wqe_cnt * hr_qp->sq.max_gs);
+	} else if (hr_qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE) {
+		cnt = roundup_pow_of_two(sq_wqe_cnt *
+				     (hr_qp->sq.max_gs - HNS_ROCE_SGE_IN_WQE));
+	} else {
+		cnt = 0;
+	}
+
+	hr_qp->sge.sge_shift = HNS_ROCE_SGE_SHIFT;
+
+	/* If any extended SGEs are used, they must occupy at least
+	 * HNS_HW_PAGE_SIZE of space.
+	 */
+	hr_qp->sge.sge_cnt = cnt ?
+			max(cnt, (u32)HNS_HW_PAGE_SIZE / HNS_ROCE_SGE_SIZE) : 0;
+
+	return 0;
+}
+
 static int check_sq_size_with_integrity(struct hns_roce_dev *hr_dev,
 					struct ib_qp_cap *cap,
 					struct hns_roce_ib_create_qp *ucmd)
@@ -334,12 +452,12 @@
 	/* Sanity check SQ size before proceeding */
 	if (ucmd->log_sq_stride > max_sq_stride ||
 	    ucmd->log_sq_stride < HNS_ROCE_IB_MIN_SQ_STRIDE) {
-		ibdev_err(&hr_dev->ib_dev, "check SQ size error!\n");
+		ibdev_err(&hr_dev->ib_dev, "failed to check SQ stride size.\n");
 		return -EINVAL;
 	}
 
 	if (cap->max_send_sge > hr_dev->caps.max_sq_sg) {
-		ibdev_err(&hr_dev->ib_dev, "SQ sge error! max_send_sge=%d\n",
+		ibdev_err(&hr_dev->ib_dev, "failed to check SQ SGE size %u.\n",
 			  cap->max_send_sge);
 		return -EINVAL;
 	}
@@ -347,275 +465,120 @@
 	return 0;
 }
 
-static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev,
-				     struct ib_qp_cap *cap,
-				     struct hns_roce_qp *hr_qp,
-				     struct hns_roce_ib_create_qp *ucmd)
+static int set_user_sq_size(struct hns_roce_dev *hr_dev,
+			    struct ib_qp_cap *cap, struct hns_roce_qp *hr_qp,
+			    struct hns_roce_ib_create_qp *ucmd)
 {
-	u32 ex_sge_num;
-	u32 page_size;
-	u32 max_cnt;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	u32 cnt = 0;
 	int ret;
 
-	if (check_shl_overflow(1, ucmd->log_sq_bb_count, &hr_qp->sq.wqe_cnt) ||
-	    hr_qp->sq.wqe_cnt > hr_dev->caps.max_wqes)
+	if (check_shl_overflow(1, ucmd->log_sq_bb_count, &cnt) ||
+	    cnt > hr_dev->caps.max_wqes)
 		return -EINVAL;
 
 	ret = check_sq_size_with_integrity(hr_dev, cap, ucmd);
 	if (ret) {
-		ibdev_err(&hr_dev->ib_dev, "Sanity check sq size failed\n");
+		ibdev_err(ibdev, "failed to check user SQ size, ret = %d.\n",
+			  ret);
 		return ret;
 	}
 
+	ret = set_extend_sge_param(hr_dev, cnt, hr_qp, cap);
+	if (ret)
+		return ret;
+
 	hr_qp->sq.wqe_shift = ucmd->log_sq_stride;
-
-	max_cnt = max(1U, cap->max_send_sge);
-	if (hr_dev->caps.max_sq_sg <= 2)
-		hr_qp->sq.max_gs = roundup_pow_of_two(max_cnt);
-	else
-		hr_qp->sq.max_gs = max_cnt;
-
-	if (hr_qp->sq.max_gs > 2)
-		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
-							(hr_qp->sq.max_gs - 2));
-
-	if ((hr_qp->sq.max_gs > 2) && (hr_dev->pci_dev->revision == 0x20)) {
-		if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) {
-			dev_err(hr_dev->dev,
-				"The extended sge cnt error! sge_cnt=%d\n",
-				hr_qp->sge.sge_cnt);
-			return -EINVAL;
-		}
-	}
-
-	hr_qp->sge.sge_shift = 4;
-	ex_sge_num = hr_qp->sge.sge_cnt;
-
-	/* Get buf size, SQ and RQ  are aligned to page_szie */
-	if (hr_dev->caps.max_sq_sg <= 2) {
-		hr_qp->buff_size = HNS_ROCE_ALOGN_UP((hr_qp->rq.wqe_cnt <<
-					     hr_qp->rq.wqe_shift), PAGE_SIZE) +
-				   HNS_ROCE_ALOGN_UP((hr_qp->sq.wqe_cnt <<
-					     hr_qp->sq.wqe_shift), PAGE_SIZE);
-
-		hr_qp->sq.offset = 0;
-		hr_qp->rq.offset = HNS_ROCE_ALOGN_UP((hr_qp->sq.wqe_cnt <<
-					     hr_qp->sq.wqe_shift), PAGE_SIZE);
-	} else {
-		page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
-		hr_qp->sge.sge_cnt = ex_sge_num ?
-		   max(page_size / (1 << hr_qp->sge.sge_shift), ex_sge_num) : 0;
-		hr_qp->buff_size = HNS_ROCE_ALOGN_UP((hr_qp->rq.wqe_cnt <<
-					     hr_qp->rq.wqe_shift), page_size) +
-				   HNS_ROCE_ALOGN_UP((hr_qp->sge.sge_cnt <<
-					     hr_qp->sge.sge_shift), page_size) +
-				   HNS_ROCE_ALOGN_UP((hr_qp->sq.wqe_cnt <<
-					     hr_qp->sq.wqe_shift), page_size);
-
-		hr_qp->sq.offset = 0;
-		if (ex_sge_num) {
-			hr_qp->sge.offset = HNS_ROCE_ALOGN_UP(
-							(hr_qp->sq.wqe_cnt <<
-							hr_qp->sq.wqe_shift),
-							page_size);
-			hr_qp->rq.offset = hr_qp->sge.offset +
-					HNS_ROCE_ALOGN_UP((hr_qp->sge.sge_cnt <<
-						hr_qp->sge.sge_shift),
-						page_size);
-		} else {
-			hr_qp->rq.offset = HNS_ROCE_ALOGN_UP(
-							(hr_qp->sq.wqe_cnt <<
-							hr_qp->sq.wqe_shift),
-							page_size);
-		}
-	}
+	hr_qp->sq.wqe_cnt = cnt;
 
 	return 0;
 }
 
-static int split_wqe_buf_region(struct hns_roce_dev *hr_dev,
-				struct hns_roce_qp *hr_qp,
-				struct hns_roce_buf_region *regions,
-				int region_max, int page_shift)
+static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev,
+			    struct hns_roce_qp *hr_qp,
+			    struct hns_roce_buf_attr *buf_attr)
 {
-	int page_size = 1 << page_shift;
-	bool is_extend_sge;
-	int region_cnt = 0;
 	int buf_size;
-	int buf_cnt;
+	int idx = 0;
 
-	if (hr_qp->buff_size < 1 || region_max < 1)
-		return region_cnt;
+	hr_qp->buff_size = 0;
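+	/* buffer layout: SQ WQEs, then extended SGEs, then RQ WQEs */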
 
-	if (hr_qp->sge.sge_cnt > 0)
-		is_extend_sge = true;
-	else
-		is_extend_sge = false;
-
-	/* sq region */
-	if (is_extend_sge)
-		buf_size = hr_qp->sge.offset - hr_qp->sq.offset;
-	else
-		buf_size = hr_qp->rq.offset - hr_qp->sq.offset;
-
-	if (buf_size > 0 && region_cnt < region_max) {
-		buf_cnt = DIV_ROUND_UP(buf_size, page_size);
-		hns_roce_init_buf_region(&regions[region_cnt],
-					 hr_dev->caps.wqe_sq_hop_num,
-					 hr_qp->sq.offset / page_size,
-					 buf_cnt);
-		region_cnt++;
+	/* SQ WQE */
+	hr_qp->sq.offset = 0;
+	buf_size = to_hr_hem_entries_size(hr_qp->sq.wqe_cnt,
+					  hr_qp->sq.wqe_shift);
+	if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) {
+		buf_attr->region[idx].size = buf_size;
+		buf_attr->region[idx].hopnum = hr_dev->caps.wqe_sq_hop_num;
+		idx++;
+		hr_qp->buff_size += buf_size;
 	}
 
-	/* sge region */
-	if (is_extend_sge) {
-		buf_size = hr_qp->rq.offset - hr_qp->sge.offset;
-		if (buf_size > 0 && region_cnt < region_max) {
-			buf_cnt = DIV_ROUND_UP(buf_size, page_size);
-			hns_roce_init_buf_region(&regions[region_cnt],
-						 hr_dev->caps.wqe_sge_hop_num,
-						 hr_qp->sge.offset / page_size,
-						 buf_cnt);
-			region_cnt++;
-		}
+	/* extended SGE space in SQ */
+	hr_qp->sge.offset = hr_qp->buff_size;
+	buf_size = to_hr_hem_entries_size(hr_qp->sge.sge_cnt,
+					  hr_qp->sge.sge_shift);
+	if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) {
+		buf_attr->region[idx].size = buf_size;
+		buf_attr->region[idx].hopnum = hr_dev->caps.wqe_sge_hop_num;
+		idx++;
+		hr_qp->buff_size += buf_size;
 	}
 
-	/* rq region */
-	buf_size = hr_qp->buff_size - hr_qp->rq.offset;
-	if (buf_size > 0) {
-		buf_cnt = DIV_ROUND_UP(buf_size, page_size);
-		hns_roce_init_buf_region(&regions[region_cnt],
-					 hr_dev->caps.wqe_rq_hop_num,
-					 hr_qp->rq.offset / page_size,
-					 buf_cnt);
-		region_cnt++;
+	/* RQ WQE */
+	hr_qp->rq.offset = hr_qp->buff_size;
+	buf_size = to_hr_hem_entries_size(hr_qp->rq.wqe_cnt,
+					  hr_qp->rq.wqe_shift);
+	if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) {
+		buf_attr->region[idx].size = buf_size;
+		buf_attr->region[idx].hopnum = hr_dev->caps.wqe_rq_hop_num;
+		idx++;
+		hr_qp->buff_size += buf_size;
 	}
 
-	return region_cnt;
-}
+	if (hr_qp->buff_size < 1)
+		return -EINVAL;
 
-static int calc_wqe_bt_page_shift(struct hns_roce_dev *hr_dev,
-				  struct hns_roce_buf_region *regions,
-				  int region_cnt)
-{
-	int bt_pg_shift;
-	int ba_num;
-	int ret;
-
-	bt_pg_shift = PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz;
-
-	/* all root ba entries must in one bt page */
-	do {
-		ba_num = (1 << bt_pg_shift) / BA_BYTE_LEN;
-		ret = hns_roce_hem_list_calc_root_ba(regions, region_cnt,
-						     ba_num);
-		if (ret <= ba_num)
-			break;
-
-		bt_pg_shift++;
-	} while (ret > ba_num);
-
-	return bt_pg_shift - PAGE_SHIFT;
-}
-
-static int set_extend_sge_param(struct hns_roce_dev *hr_dev,
-				struct hns_roce_qp *hr_qp)
-{
-	struct device *dev = hr_dev->dev;
-
-	if (hr_qp->sq.max_gs > 2) {
-		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
-				     (hr_qp->sq.max_gs - 2));
-		hr_qp->sge.sge_shift = 4;
-	}
-
-	/* ud sqwqe's sge use extend sge */
-	if (hr_dev->caps.max_sq_sg > 2 && hr_qp->ibqp.qp_type == IB_QPT_GSI) {
-		hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt *
-				     hr_qp->sq.max_gs);
-		hr_qp->sge.sge_shift = 4;
-	}
-
-	if ((hr_qp->sq.max_gs > 2) && hr_dev->pci_dev->revision == 0x20) {
-		if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) {
-			dev_err(dev, "The extended sge cnt error! sge_cnt=%d\n",
-				hr_qp->sge.sge_cnt);
-			return -EINVAL;
-		}
-	}
+	buf_attr->page_shift = HNS_HW_PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz;
+	buf_attr->fixed_page = true;
+	buf_attr->region_count = idx;
 
 	return 0;
 }
 
-static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev,
-				       struct ib_qp_cap *cap,
-				       struct hns_roce_qp *hr_qp)
+static int set_kernel_sq_size(struct hns_roce_dev *hr_dev,
+			      struct ib_qp_cap *cap, struct hns_roce_qp *hr_qp)
 {
-	struct device *dev = hr_dev->dev;
-	u32 page_size;
-	u32 max_cnt;
-	int size;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	u32 cnt;
 	int ret;
 
-	if (cap->max_send_wr  > hr_dev->caps.max_wqes  ||
-	    cap->max_send_sge > hr_dev->caps.max_sq_sg ||
-	    cap->max_inline_data > hr_dev->caps.max_sq_inline) {
-		dev_err(dev, "SQ WR or sge or inline data error!\n");
+	if (!cap->max_send_wr || cap->max_send_wr > hr_dev->caps.max_wqes ||
+	    cap->max_send_sge > hr_dev->caps.max_sq_sg) {
+		ibdev_err(ibdev,
+			  "failed to check SQ WR or SGE num, ret = %d.\n",
+			  -EINVAL);
+		return -EINVAL;
+	}
+
+	cnt = roundup_pow_of_two(max(cap->max_send_wr, hr_dev->caps.min_wqes));
+	if (cnt > hr_dev->caps.max_wqes) {
+		ibdev_err(ibdev, "failed to check WQE num, WQE num = %u.\n",
+			  cnt);
 		return -EINVAL;
 	}
 
 	hr_qp->sq.wqe_shift = ilog2(hr_dev->caps.max_sq_desc_sz);
+	hr_qp->sq.wqe_cnt = cnt;
 
-	if (hr_dev->caps.min_wqes)
-		max_cnt = max(cap->max_send_wr, hr_dev->caps.min_wqes);
-	else
-		max_cnt = cap->max_send_wr;
-
-	hr_qp->sq.wqe_cnt = roundup_pow_of_two(max_cnt);
-	if ((u32)hr_qp->sq.wqe_cnt > hr_dev->caps.max_wqes) {
-		dev_err(dev, "while setting kernel sq size, sq.wqe_cnt too large\n");
-		return -EINVAL;
-	}
-
-	/* Get data_seg numbers */
-	max_cnt = max(1U, cap->max_send_sge);
-	if (hr_dev->caps.max_sq_sg <= 2)
-		hr_qp->sq.max_gs = roundup_pow_of_two(max_cnt);
-	else
-		hr_qp->sq.max_gs = max_cnt;
-
-	ret = set_extend_sge_param(hr_dev, hr_qp);
-	if (ret) {
-		dev_err(dev, "set extend sge parameters fail\n");
+	ret = set_extend_sge_param(hr_dev, cnt, hr_qp, cap);
+	if (ret)
 		return ret;
-	}
 
-	/* Get buf size, SQ and RQ are aligned to PAGE_SIZE */
-	page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
-	hr_qp->sq.offset = 0;
-	size = HNS_ROCE_ALOGN_UP(hr_qp->sq.wqe_cnt << hr_qp->sq.wqe_shift,
-				 page_size);
-
-	if (hr_dev->caps.max_sq_sg > 2 && hr_qp->sge.sge_cnt) {
-		hr_qp->sge.sge_cnt = max(page_size/(1 << hr_qp->sge.sge_shift),
-					(u32)hr_qp->sge.sge_cnt);
-		hr_qp->sge.offset = size;
-		size += HNS_ROCE_ALOGN_UP(hr_qp->sge.sge_cnt <<
-					  hr_qp->sge.sge_shift, page_size);
-	}
-
-	hr_qp->rq.offset = size;
-	size += HNS_ROCE_ALOGN_UP((hr_qp->rq.wqe_cnt << hr_qp->rq.wqe_shift),
-				  page_size);
-	hr_qp->buff_size = size;
-
-	/* Get wr and sge number which send */
-	cap->max_send_wr = hr_qp->sq.max_post = hr_qp->sq.wqe_cnt;
+	/* sync the parameters of kernel QP to user's configuration */
+	cap->max_send_wr = cnt;
 	cap->max_send_sge = hr_qp->sq.max_gs;
 
-	/* We don't support inline sends for kernel QPs (yet) */
-	cap->max_inline_data = 0;
-
 	return 0;
 }
 
@@ -641,8 +604,8 @@
 			       struct ib_qp_init_attr *init_attr)
 {
 	u32 max_recv_sge = init_attr->cap.max_recv_sge;
+	u32 wqe_cnt = hr_qp->rq_inl_buf.wqe_cnt;
 	struct hns_roce_rinl_wqe *wqe_list;
-	u32 wqe_cnt = hr_qp->rq.wqe_cnt;
 	int i;
 
 	/* allocate recv inline buf */
@@ -664,7 +627,6 @@
 		wqe_list[i].sg_list = &wqe_list[0].sg_list[i * max_recv_sge];
 
 	hr_qp->rq_inl_buf.wqe_list = wqe_list;
-	hr_qp->rq_inl_buf.wqe_cnt = wqe_cnt;
 
 	return 0;
 
@@ -677,410 +639,423 @@
 
 static void free_rq_inline_buf(struct hns_roce_qp *hr_qp)
 {
-	kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list);
+	if (hr_qp->rq_inl_buf.wqe_list)
+		kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list);
 	kfree(hr_qp->rq_inl_buf.wqe_list);
 }
 
-static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
-				     struct ib_pd *ib_pd,
-				     struct ib_qp_init_attr *init_attr,
-				     struct ib_udata *udata, unsigned long sqpn,
-				     struct hns_roce_qp *hr_qp)
+static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
+			struct ib_qp_init_attr *init_attr,
+			struct ib_udata *udata, unsigned long addr)
 {
-	dma_addr_t *buf_list[ARRAY_SIZE(hr_qp->regions)] = { NULL };
-	struct device *dev = hr_dev->dev;
-	struct hns_roce_ib_create_qp ucmd;
-	struct hns_roce_ib_create_qp_resp resp = {};
-	struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(
-		udata, struct hns_roce_ucontext, ibucontext);
-	struct hns_roce_buf_region *r;
-	unsigned long qpn = 0;
-	u32 page_shift;
-	int buf_count;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_buf_attr buf_attr = {};
 	int ret;
-	int i;
 
-	mutex_init(&hr_qp->mutex);
-	spin_lock_init(&hr_qp->sq.lock);
-	spin_lock_init(&hr_qp->rq.lock);
-
-	hr_qp->state = IB_QPS_RESET;
-
-	hr_qp->ibqp.qp_type = init_attr->qp_type;
-
-	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
-		hr_qp->sq_signal_bits = IB_SIGNAL_ALL_WR;
-	else
-		hr_qp->sq_signal_bits = IB_SIGNAL_REQ_WR;
-
-	ret = hns_roce_set_rq_size(hr_dev, &init_attr->cap, udata,
-				   hns_roce_qp_has_rq(init_attr), hr_qp);
-	if (ret) {
-		dev_err(dev, "hns_roce_set_rq_size failed\n");
-		goto err_out;
-	}
-
-	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
-	    hns_roce_qp_has_rq(init_attr)) {
+	if (!udata && hr_qp->rq_inl_buf.wqe_cnt) {
 		ret = alloc_rq_inline_buf(hr_qp, init_attr);
 		if (ret) {
-			dev_err(dev, "allocate receive inline buffer failed\n");
-			goto err_out;
-		}
-	}
-
-	page_shift = PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz;
-	if (udata) {
-		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
-			dev_err(dev, "ib_copy_from_udata error for create qp\n");
-			ret = -EFAULT;
-			goto err_alloc_rq_inline_buf;
-		}
-
-		ret = hns_roce_set_user_sq_size(hr_dev, &init_attr->cap, hr_qp,
-						&ucmd);
-		if (ret) {
-			dev_err(dev, "hns_roce_set_user_sq_size error for create qp\n");
-			goto err_alloc_rq_inline_buf;
-		}
-
-		hr_qp->umem = ib_umem_get(udata, ucmd.buf_addr,
-					  hr_qp->buff_size, 0, 0);
-		if (IS_ERR(hr_qp->umem)) {
-			dev_err(dev, "ib_umem_get error for create qp\n");
-			ret = PTR_ERR(hr_qp->umem);
-			goto err_alloc_rq_inline_buf;
-		}
-		hr_qp->region_cnt = split_wqe_buf_region(hr_dev, hr_qp,
-				hr_qp->regions, ARRAY_SIZE(hr_qp->regions),
-				page_shift);
-		ret = hns_roce_alloc_buf_list(hr_qp->regions, buf_list,
-					      hr_qp->region_cnt);
-		if (ret) {
-			dev_err(dev, "alloc buf_list error for create qp\n");
-			goto err_alloc_list;
-		}
-
-		for (i = 0; i < hr_qp->region_cnt; i++) {
-			r = &hr_qp->regions[i];
-			buf_count = hns_roce_get_umem_bufs(hr_dev,
-					buf_list[i], r->count, r->offset,
-					hr_qp->umem, page_shift);
-			if (buf_count != r->count) {
-				dev_err(dev,
-					"get umem buf err, expect %d,ret %d.\n",
-					r->count, buf_count);
-				ret = -ENOBUFS;
-				goto err_get_bufs;
-			}
-		}
-
-		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) &&
-		    (udata->inlen >= sizeof(ucmd)) &&
-		    (udata->outlen >= sizeof(resp)) &&
-		    hns_roce_qp_has_sq(init_attr)) {
-			ret = hns_roce_db_map_user(uctx, udata, ucmd.sdb_addr,
-						   &hr_qp->sdb);
-			if (ret) {
-				dev_err(dev, "sq record doorbell map failed!\n");
-				goto err_get_bufs;
-			}
-
-			/* indicate kernel supports sq record db */
-			resp.cap_flags |= HNS_ROCE_SUPPORT_SQ_RECORD_DB;
-			hr_qp->sdb_en = 1;
-		}
-
-		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
-		    (udata->outlen >= sizeof(resp)) &&
-		    hns_roce_qp_has_rq(init_attr)) {
-			ret = hns_roce_db_map_user(uctx, udata, ucmd.db_addr,
-						   &hr_qp->rdb);
-			if (ret) {
-				dev_err(dev, "rq record doorbell map failed!\n");
-				goto err_sq_dbmap;
-			}
-
-			/* indicate kernel supports rq record db */
-			resp.cap_flags |= HNS_ROCE_SUPPORT_RQ_RECORD_DB;
-			hr_qp->rdb_en = 1;
+			ibdev_err(ibdev,
+				  "failed to alloc inline buf, ret = %d.\n",
+				  ret);
+			return ret;
 		}
 	} else {
-		if (init_attr->create_flags &
-		    IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
-			dev_err(dev, "init_attr->create_flags error!\n");
-			ret = -EINVAL;
-			goto err_alloc_rq_inline_buf;
+		hr_qp->rq_inl_buf.wqe_list = NULL;
+	}
+
+	ret = set_wqe_buf_attr(hr_dev, hr_qp, &buf_attr);
+	if (ret) {
+		ibdev_err(ibdev, "failed to split WQE buf, ret = %d.\n", ret);
+		goto err_inline;
+	}
+	ret = hns_roce_mtr_create(hr_dev, &hr_qp->mtr, &buf_attr,
+				  HNS_HW_PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz,
+				  udata, addr);
+	if (ret) {
+		ibdev_err(ibdev, "failed to create WQE mtr, ret = %d.\n", ret);
+		goto err_inline;
+	}
+
+	return 0;
+err_inline:
+	free_rq_inline_buf(hr_qp);
+
+	return ret;
+}
+
+static void free_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
+{
+	hns_roce_mtr_destroy(hr_dev, &hr_qp->mtr);
+	free_rq_inline_buf(hr_qp);
+}
+
+static inline bool user_qp_has_sdb(struct hns_roce_dev *hr_dev,
+				   struct ib_qp_init_attr *init_attr,
+				   struct ib_udata *udata,
+				   struct hns_roce_ib_create_qp_resp *resp,
+				   struct hns_roce_ib_create_qp *ucmd)
+{
+	return ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) &&
+		udata->outlen >= offsetofend(typeof(*resp), cap_flags) &&
+		hns_roce_qp_has_sq(init_attr) &&
+		udata->inlen >= offsetofend(typeof(*ucmd), sdb_addr));
+}
+
+static inline bool user_qp_has_rdb(struct hns_roce_dev *hr_dev,
+				   struct ib_qp_init_attr *init_attr,
+				   struct ib_udata *udata,
+				   struct hns_roce_ib_create_qp_resp *resp)
+{
+	return ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
+		udata->outlen >= offsetofend(typeof(*resp), cap_flags) &&
+		hns_roce_qp_has_rq(init_attr));
+}
+
+static inline bool kernel_qp_has_rdb(struct hns_roce_dev *hr_dev,
+				     struct ib_qp_init_attr *init_attr)
+{
+	return ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
+		hns_roce_qp_has_rq(init_attr));
+}
+
+static int alloc_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
+		       struct ib_qp_init_attr *init_attr,
+		       struct ib_udata *udata,
+		       struct hns_roce_ib_create_qp *ucmd,
+		       struct hns_roce_ib_create_qp_resp *resp)
+{
+	struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(
+		udata, struct hns_roce_ucontext, ibucontext);
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	int ret;
+
+	if (udata) {
+		if (user_qp_has_sdb(hr_dev, init_attr, udata, resp, ucmd)) {
+			ret = hns_roce_db_map_user(uctx, udata, ucmd->sdb_addr,
+						   &hr_qp->sdb);
+			if (ret) {
+				ibdev_err(ibdev,
+					  "failed to map user SQ doorbell, ret = %d.\n",
+					  ret);
+				goto err_out;
+			}
+			hr_qp->en_flags |= HNS_ROCE_QP_CAP_SQ_RECORD_DB;
 		}
 
-		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) {
-			dev_err(dev, "init_attr->create_flags error!\n");
-			ret = -EINVAL;
-			goto err_alloc_rq_inline_buf;
+		if (user_qp_has_rdb(hr_dev, init_attr, udata, resp)) {
+			ret = hns_roce_db_map_user(uctx, udata, ucmd->db_addr,
+						   &hr_qp->rdb);
+			if (ret) {
+				ibdev_err(ibdev,
+					  "failed to map user RQ doorbell, ret = %d.\n",
+					  ret);
+				goto err_sdb;
+			}
+			hr_qp->en_flags |= HNS_ROCE_QP_CAP_RQ_RECORD_DB;
 		}
-
-		/* Set SQ size */
-		ret = hns_roce_set_kernel_sq_size(hr_dev, &init_attr->cap,
-						  hr_qp);
-		if (ret) {
-			dev_err(dev, "hns_roce_set_kernel_sq_size error!\n");
-			goto err_alloc_rq_inline_buf;
-		}
-
+	} else {
 		/* QP doorbell register address */
 		hr_qp->sq.db_reg_l = hr_dev->reg_base + hr_dev->sdb_offset +
 				     DB_REG_OFFSET * hr_dev->priv_uar.index;
 		hr_qp->rq.db_reg_l = hr_dev->reg_base + hr_dev->odb_offset +
 				     DB_REG_OFFSET * hr_dev->priv_uar.index;
 
-		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
-		    hns_roce_qp_has_rq(init_attr)) {
+		if (kernel_qp_has_rdb(hr_dev, init_attr)) {
 			ret = hns_roce_alloc_db(hr_dev, &hr_qp->rdb, 0);
 			if (ret) {
-				dev_err(dev, "rq record doorbell alloc failed!\n");
-				goto err_alloc_rq_inline_buf;
+				ibdev_err(ibdev,
+					  "failed to alloc kernel RQ doorbell, ret = %d.\n",
+					  ret);
+				goto err_out;
 			}
 			*hr_qp->rdb.db_record = 0;
-			hr_qp->rdb_en = 1;
-		}
-
-		/* Allocate QP buf */
-		if (hns_roce_buf_alloc(hr_dev, hr_qp->buff_size,
-				       (1 << page_shift) * 2,
-				       &hr_qp->hr_buf, page_shift)) {
-			dev_err(dev, "hns_roce_buf_alloc error!\n");
-			ret = -ENOMEM;
-			goto err_db;
-		}
-		hr_qp->region_cnt = split_wqe_buf_region(hr_dev, hr_qp,
-				hr_qp->regions, ARRAY_SIZE(hr_qp->regions),
-				page_shift);
-		ret = hns_roce_alloc_buf_list(hr_qp->regions, buf_list,
-					      hr_qp->region_cnt);
-		if (ret) {
-			dev_err(dev, "alloc buf_list error for create qp!\n");
-			goto err_alloc_list;
-		}
-
-		for (i = 0; i < hr_qp->region_cnt; i++) {
-			r = &hr_qp->regions[i];
-			buf_count = hns_roce_get_kmem_bufs(hr_dev,
-					buf_list[i], r->count, r->offset,
-					&hr_qp->hr_buf);
-			if (buf_count != r->count) {
-				dev_err(dev,
-					"get kmem buf err, expect %d,ret %d.\n",
-					r->count, buf_count);
-				ret = -ENOBUFS;
-				goto err_get_bufs;
-			}
-		}
-
-		hr_qp->sq.wrid = kcalloc(hr_qp->sq.wqe_cnt, sizeof(u64),
-					 GFP_KERNEL);
-		if (ZERO_OR_NULL_PTR(hr_qp->sq.wrid)) {
-			ret = -ENOMEM;
-			goto err_get_bufs;
-		}
-
-		if (hr_qp->rq.wqe_cnt) {
-			hr_qp->rq.wrid = kcalloc(hr_qp->rq.wqe_cnt, sizeof(u64),
-						 GFP_KERNEL);
-			if (ZERO_OR_NULL_PTR(hr_qp->rq.wrid)) {
-				ret = -ENOMEM;
-				goto err_sq_wrid;
-			}
+			hr_qp->en_flags |= HNS_ROCE_QP_CAP_RQ_RECORD_DB;
 		}
 	}
 
-	if (sqpn) {
-		qpn = sqpn;
-	} else {
-		/* Get QPN */
-		ret = hns_roce_reserve_range_qp(hr_dev, 1, 1, &qpn);
-		if (ret) {
-			dev_err(dev, "hns_roce_reserve_range_qp alloc qpn error\n");
-			goto err_wrid;
-		}
-	}
+	return 0;
+err_sdb:
+	if (udata && hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB)
+		hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
+err_out:
+	return ret;
+}
 
-	hr_qp->wqe_bt_pg_shift = calc_wqe_bt_page_shift(hr_dev, hr_qp->regions,
-							hr_qp->region_cnt);
-	hns_roce_mtr_init(&hr_qp->mtr, PAGE_SHIFT + hr_qp->wqe_bt_pg_shift,
-			  page_shift);
-	ret = hns_roce_mtr_attach(hr_dev, &hr_qp->mtr, buf_list,
-				  hr_qp->regions, hr_qp->region_cnt);
-	if (ret) {
-		dev_err(dev, "mtr attach error for create qp\n");
-		goto err_mtr;
-	}
-
-	if (init_attr->qp_type == IB_QPT_GSI &&
-	    hr_dev->hw_rev == HNS_ROCE_HW_VER1) {
-		/* In v1 engine, GSI QP context in RoCE engine's register */
-		ret = hns_roce_gsi_qp_alloc(hr_dev, qpn, hr_qp);
-		if (ret) {
-			dev_err(dev, "hns_roce_qp_alloc failed!\n");
-			goto err_qpn;
-		}
-	} else {
-		ret = hns_roce_qp_alloc(hr_dev, qpn, hr_qp);
-		if (ret) {
-			dev_err(dev, "hns_roce_qp_alloc failed!\n");
-			goto err_qpn;
-		}
-	}
-
-	if (sqpn)
-		hr_qp->doorbell_qpn = 1;
-	else
-		hr_qp->doorbell_qpn = (u32)hr_qp->qpn;
+static void free_qp_db(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
+		       struct ib_udata *udata)
+{
+	struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(
+		udata, struct hns_roce_ucontext, ibucontext);
 
 	if (udata) {
+		if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB)
+			hns_roce_db_unmap_user(uctx, &hr_qp->rdb);
+		if (hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB)
+			hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
+	} else {
+		if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB)
+			hns_roce_free_db(hr_dev, &hr_qp->rdb);
+	}
+}
+
+static int alloc_kernel_wrid(struct hns_roce_dev *hr_dev,
+			     struct hns_roce_qp *hr_qp)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	u64 *sq_wrid = NULL;
+	u64 *rq_wrid = NULL;
+	int ret;
+
+	sq_wrid = kcalloc(hr_qp->sq.wqe_cnt, sizeof(u64), GFP_KERNEL);
+	if (ZERO_OR_NULL_PTR(sq_wrid)) {
+		ibdev_err(ibdev, "failed to alloc SQ wrid.\n");
+		return -ENOMEM;
+	}
+
+	if (hr_qp->rq.wqe_cnt) {
+		rq_wrid = kcalloc(hr_qp->rq.wqe_cnt, sizeof(u64), GFP_KERNEL);
+		if (ZERO_OR_NULL_PTR(rq_wrid)) {
+			ibdev_err(ibdev, "failed to alloc RQ wrid.\n");
+			ret = -ENOMEM;
+			goto err_sq;
+		}
+	}
+
+	hr_qp->sq.wrid = sq_wrid;
+	hr_qp->rq.wrid = rq_wrid;
+	return 0;
+err_sq:
+	kfree(sq_wrid);
+
+	return ret;
+}
+
+static void free_kernel_wrid(struct hns_roce_qp *hr_qp)
+{
+	kfree(hr_qp->rq.wrid);
+	kfree(hr_qp->sq.wrid);
+}
+
+static int set_qp_param(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
+			struct ib_qp_init_attr *init_attr,
+			struct ib_udata *udata,
+			struct hns_roce_ib_create_qp *ucmd)
+{
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	int ret;
+
+	hr_qp->ibqp.qp_type = init_attr->qp_type;
+
+	if (init_attr->cap.max_inline_data > hr_dev->caps.max_sq_inline)
+		init_attr->cap.max_inline_data = hr_dev->caps.max_sq_inline;
+
+	hr_qp->max_inline_data = init_attr->cap.max_inline_data;
+
+	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+		hr_qp->sq_signal_bits = IB_SIGNAL_ALL_WR;
+	else
+		hr_qp->sq_signal_bits = IB_SIGNAL_REQ_WR;
+
+	ret = set_rq_size(hr_dev, &init_attr->cap, hr_qp,
+			  hns_roce_qp_has_rq(init_attr));
+	if (ret) {
+		ibdev_err(ibdev, "failed to set user RQ size, ret = %d.\n",
+			  ret);
+		return ret;
+	}
+
+	if (udata) {
+		ret = ib_copy_from_udata(ucmd, udata,
+					 min(udata->inlen, sizeof(*ucmd)));
+		if (ret) {
+			ibdev_err(ibdev,
+				  "failed to copy QP ucmd, ret = %d\n", ret);
+			return ret;
+		}
+
+		ret = set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, ucmd);
+		if (ret)
+			ibdev_err(ibdev,
+				  "failed to set user SQ size, ret = %d.\n",
+				  ret);
+	} else {
+		if (init_attr->create_flags &
+		    IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
+			ibdev_err(ibdev, "Failed to check multicast loopback\n");
+			return -EINVAL;
+		}
+
+		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) {
+			ibdev_err(ibdev, "Failed to check ipoib ud lso\n");
+			return -EINVAL;
+		}
+
+		ret = set_kernel_sq_size(hr_dev, &init_attr->cap, hr_qp);
+		if (ret)
+			ibdev_err(ibdev,
+				  "failed to set kernel SQ size, ret = %d.\n",
+				  ret);
+	}
+
+	return ret;
+}
+
+static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
+				     struct ib_pd *ib_pd,
+				     struct ib_qp_init_attr *init_attr,
+				     struct ib_udata *udata,
+				     struct hns_roce_qp *hr_qp)
+{
+	struct hns_roce_ib_create_qp_resp resp = {};
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_ib_create_qp ucmd;
+	int ret;
+
+	mutex_init(&hr_qp->mutex);
+	spin_lock_init(&hr_qp->sq.lock);
+	spin_lock_init(&hr_qp->rq.lock);
+
+	hr_qp->state = IB_QPS_RESET;
+	hr_qp->flush_flag = 0;
+
+	ret = set_qp_param(hr_dev, hr_qp, init_attr, udata, &ucmd);
+	if (ret) {
+		ibdev_err(ibdev, "failed to set QP param, ret = %d.\n", ret);
+		return ret;
+	}
+
+	if (!udata) {
+		ret = alloc_kernel_wrid(hr_dev, hr_qp);
+		if (ret) {
+			ibdev_err(ibdev, "failed to alloc wrid, ret = %d.\n",
+				  ret);
+			return ret;
+		}
+	}
+
+	ret = alloc_qp_db(hr_dev, hr_qp, init_attr, udata, &ucmd, &resp);
+	if (ret) {
+		ibdev_err(ibdev, "failed to alloc QP doorbell, ret = %d.\n",
+			  ret);
+		goto err_wrid;
+	}
+
+	ret = alloc_qp_buf(hr_dev, hr_qp, init_attr, udata, ucmd.buf_addr);
+	if (ret) {
+		ibdev_err(ibdev, "failed to alloc QP buffer, ret = %d.\n", ret);
+		goto err_db;
+	}
+
+	ret = alloc_qpn(hr_dev, hr_qp);
+	if (ret) {
+		ibdev_err(ibdev, "failed to alloc QPN, ret = %d.\n", ret);
+		goto err_buf;
+	}
+
+	ret = alloc_qpc(hr_dev, hr_qp);
+	if (ret) {
+		ibdev_err(ibdev, "failed to alloc QP context, ret = %d.\n",
+			  ret);
+		goto err_qpn;
+	}
+
+	ret = hns_roce_qp_store(hr_dev, hr_qp, init_attr);
+	if (ret) {
+		ibdev_err(ibdev, "failed to store QP, ret = %d.\n", ret);
+		goto err_qpc;
+	}
+
+	if (udata) {
+		resp.cap_flags = hr_qp->en_flags;
 		ret = ib_copy_to_udata(udata, &resp,
 				       min(udata->outlen, sizeof(resp)));
-		if (ret)
-			goto err_qp;
+		if (ret) {
+			ibdev_err(ibdev, "copy qp resp failed!\n");
+			goto err_store;
+		}
 	}
 
 	if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_QP_FLOW_CTRL) {
 		ret = hr_dev->hw->qp_flow_control_init(hr_dev, hr_qp);
 		if (ret)
-			goto err_qp;
+			goto err_store;
 	}
 
+	hr_qp->ibqp.qp_num = hr_qp->qpn;
 	hr_qp->event = hns_roce_ib_qp_event;
-	hns_roce_free_buf_list(buf_list, hr_qp->region_cnt);
+	atomic_set(&hr_qp->refcount, 1);
+	init_completion(&hr_qp->free);
 
 	return 0;
 
-err_qp:
-	if (init_attr->qp_type == IB_QPT_GSI &&
-		hr_dev->hw_rev == HNS_ROCE_HW_VER1)
-		hns_roce_qp_remove(hr_dev, hr_qp);
-	else
-		hns_roce_qp_free(hr_dev, hr_qp);
-
+err_store:
+	hns_roce_qp_remove(hr_dev, hr_qp);
+err_qpc:
+	free_qpc(hr_dev, hr_qp);
 err_qpn:
-	if (!sqpn)
-		hns_roce_release_range_qp(hr_dev, qpn, 1);
-
-err_mtr:
-	hns_roce_mtr_cleanup(hr_dev, &hr_qp->mtr);
-
-err_wrid:
-	if (udata) {
-		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
-		    (udata->outlen >= sizeof(resp)) &&
-		    hns_roce_qp_has_rq(init_attr))
-			hns_roce_db_unmap_user(uctx, &hr_qp->rdb);
-	} else {
-		if (hr_qp->rq.wqe_cnt)
-			kfree(hr_qp->rq.wrid);
-	}
-
-err_sq_dbmap:
-	if (udata)
-		if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_SQ_RECORD_DB) &&
-		    (udata->inlen >= sizeof(ucmd)) &&
-		    (udata->outlen >= sizeof(resp)) &&
-		    hns_roce_qp_has_sq(init_attr))
-			hns_roce_db_unmap_user(uctx, &hr_qp->sdb);
-
-err_sq_wrid:
-	if (!udata)
-		kfree(hr_qp->sq.wrid);
-
-err_get_bufs:
-	hns_roce_free_buf_list(buf_list, hr_qp->region_cnt);
-
-err_alloc_list:
-	if (!hr_qp->umem)
-		hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
-	ib_umem_release(hr_qp->umem);
-
+	free_qpn(hr_dev, hr_qp);
+err_buf:
+	free_qp_buf(hr_dev, hr_qp);
 err_db:
-	if (!udata && hns_roce_qp_has_rq(init_attr) &&
-	    (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB))
-		hns_roce_free_db(hr_dev, &hr_qp->rdb);
-
-err_alloc_rq_inline_buf:
-	if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) &&
-	     hns_roce_qp_has_rq(init_attr))
-		free_rq_inline_buf(hr_qp);
-
-err_out:
+	free_qp_db(hr_dev, hr_qp, udata);
+err_wrid:
+	free_kernel_wrid(hr_qp);
 	return ret;
 }
 
+void hns_roce_qp_destroy(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
+			 struct ib_udata *udata)
+{
+	if (atomic_dec_and_test(&hr_qp->refcount))
+		complete(&hr_qp->free);
+	wait_for_completion(&hr_qp->free);
+
+	free_qpc(hr_dev, hr_qp);
+	free_qpn(hr_dev, hr_qp);
+	free_qp_buf(hr_dev, hr_qp);
+	free_kernel_wrid(hr_qp);
+	free_qp_db(hr_dev, hr_qp, udata);
+
+	kfree(hr_qp);
+}
+
 struct ib_qp *hns_roce_create_qp(struct ib_pd *pd,
 				 struct ib_qp_init_attr *init_attr,
 				 struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
 	struct ib_device *ibdev = &hr_dev->ib_dev;
-	struct hns_roce_sqp *hr_sqp;
 	struct hns_roce_qp *hr_qp;
 	int ret;
 
 	switch (init_attr->qp_type) {
-	case IB_QPT_RC: {
-		hr_qp = kzalloc(sizeof(*hr_qp), GFP_KERNEL);
-		if (!hr_qp)
-			return ERR_PTR(-ENOMEM);
-
-		ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata, 0,
-						hr_qp);
-		if (ret) {
-			ibdev_err(ibdev, "Create RC QP 0x%06lx failed(%d)\n",
-				  hr_qp->qpn, ret);
-			kfree(hr_qp);
-			return ERR_PTR(ret);
-		}
-
-		hr_qp->ibqp.qp_num = hr_qp->qpn;
-
+	case IB_QPT_RC:
+	case IB_QPT_GSI:
 		break;
-	}
-	case IB_QPT_GSI: {
-		/* Userspace is not allowed to create special QPs: */
-		if (udata) {
-			ibdev_err(ibdev, "not support usr space GSI\n");
-			return ERR_PTR(-EINVAL);
-		}
-
-		hr_sqp = kzalloc(sizeof(*hr_sqp), GFP_KERNEL);
-		if (!hr_sqp)
-			return ERR_PTR(-ENOMEM);
-
-		hr_qp = &hr_sqp->hr_qp;
-		hr_qp->port = init_attr->port_num - 1;
-		hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port];
-
-		/* when hw version is v1, the sqpn is allocated */
-		if (hr_dev->caps.max_sq_sg <= 2)
-			hr_qp->ibqp.qp_num = HNS_ROCE_MAX_PORTS +
-					     hr_dev->iboe.phy_port[hr_qp->port];
-		else
-			hr_qp->ibqp.qp_num = 1;
-
-		ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata,
-						hr_qp->ibqp.qp_num, hr_qp);
-		if (ret) {
-			ibdev_err(ibdev, "Create GSI QP failed!\n");
-			kfree(hr_sqp);
-			return ERR_PTR(ret);
-		}
-
-		break;
-	}
-	default:{
+	default:
 		ibdev_err(ibdev, "not support QP type %d\n",
 			  init_attr->qp_type);
-		return ERR_PTR(-EINVAL);
-	}
+		return ERR_PTR(-EOPNOTSUPP);
 	}
 
+	hr_qp = kzalloc(sizeof(*hr_qp), GFP_KERNEL);
+	if (!hr_qp)
+		return ERR_PTR(-ENOMEM);
+
+	if (init_attr->qp_type == IB_QPT_GSI) {
+		hr_qp->port = init_attr->port_num - 1;
+		hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port];
+	}
+
+	ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata, hr_qp);
+	if (ret) {
+		ibdev_err(ibdev, "Create QP type 0x%x failed(%d)\n",
+			  init_attr->qp_type, ret);
+		kfree(hr_qp);
+		return ERR_PTR(ret);
+	}
 	return &hr_qp->ibqp;
 }
 
@@ -1133,9 +1108,8 @@
 
 	if ((attr_mask & IB_QP_PORT) &&
 	    (attr->port_num == 0 || attr->port_num > hr_dev->caps.num_ports)) {
-		ibdev_err(&hr_dev->ib_dev,
-			"attr port_num invalid.attr->port_num=%d\n",
-			attr->port_num);
+		ibdev_err(&hr_dev->ib_dev, "invalid attr, port_num = %u.\n",
+			  attr->port_num);
 		return -EINVAL;
 	}
 
@@ -1143,8 +1117,8 @@
 		p = attr_mask & IB_QP_PORT ? (attr->port_num - 1) : hr_qp->port;
 		if (attr->pkey_index >= hr_dev->caps.pkey_table_len[p]) {
 			ibdev_err(&hr_dev->ib_dev,
-				"attr pkey_index invalid.attr->pkey_index=%d\n",
-				attr->pkey_index);
+				  "invalid attr, pkey_index = %u.\n",
+				  attr->pkey_index);
 			return -EINVAL;
 		}
 	}
@@ -1152,16 +1126,16 @@
 	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
 	    attr->max_rd_atomic > hr_dev->caps.max_qp_init_rdma) {
 		ibdev_err(&hr_dev->ib_dev,
-			"attr max_rd_atomic invalid.attr->max_rd_atomic=%d\n",
-			attr->max_rd_atomic);
+			  "invalid attr, max_rd_atomic = %u.\n",
+			  attr->max_rd_atomic);
 		return -EINVAL;
 	}
 
 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
 	    attr->max_dest_rd_atomic > hr_dev->caps.max_qp_dest_rdma) {
 		ibdev_err(&hr_dev->ib_dev,
-			"attr max_dest_rd_atomic invalid.attr->max_dest_rd_atomic=%d\n",
-			attr->max_dest_rd_atomic);
+			  "invalid attr, max_dest_rd_atomic = %u.\n",
+			  attr->max_dest_rd_atomic);
 		return -EINVAL;
 	}
 
@@ -1189,10 +1163,10 @@
 
 	if (ibqp->uobject &&
 	    (attr_mask & IB_QP_STATE) && new_state == IB_QPS_ERR) {
-		if (hr_qp->sdb_en == 1) {
+		if (hr_qp->en_flags & HNS_ROCE_QP_CAP_SQ_RECORD_DB) {
 			hr_qp->sq.head = *(int *)(hr_qp->sdb.virt_addr);
 
-			if (hr_qp->rdb_en == 1)
+			if (hr_qp->en_flags & HNS_ROCE_QP_CAP_RQ_RECORD_DB)
 				hr_qp->rq.head = *(int *)(hr_qp->rdb.virt_addr);
 		} else {
 			ibdev_warn(&hr_dev->ib_dev,
@@ -1212,11 +1186,10 @@
 		goto out;
 
 	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
-		if (hr_dev->caps.min_wqes) {
+		if (hr_dev->hw_rev == HNS_ROCE_HW_VER1) {
 			ret = -EPERM;
 			ibdev_err(&hr_dev->ib_dev,
-				"cur_state=%d new_state=%d\n", cur_state,
-				new_state);
+				  "RST2RST state is not supported\n");
 		} else {
 			ret = 0;
 		}
@@ -1236,7 +1209,16 @@
 void hns_roce_lock_cqs(struct hns_roce_cq *send_cq, struct hns_roce_cq *recv_cq)
 		       __acquires(&send_cq->lock) __acquires(&recv_cq->lock)
 {
-	if (send_cq == recv_cq) {
+	if (unlikely(send_cq == NULL && recv_cq == NULL)) {
+		__acquire(&send_cq->lock);
+		__acquire(&recv_cq->lock);
+	} else if (unlikely(send_cq != NULL && recv_cq == NULL)) {
+		spin_lock_irq(&send_cq->lock);
+		__acquire(&recv_cq->lock);
+	} else if (unlikely(send_cq == NULL && recv_cq != NULL)) {
+		spin_lock_irq(&recv_cq->lock);
+		__acquire(&send_cq->lock);
+	} else if (send_cq == recv_cq) {
 		spin_lock_irq(&send_cq->lock);
 		__acquire(&recv_cq->lock);
 	} else if (send_cq->cqn < recv_cq->cqn) {
@@ -1252,7 +1234,16 @@
 			 struct hns_roce_cq *recv_cq) __releases(&send_cq->lock)
 			 __releases(&recv_cq->lock)
 {
-	if (send_cq == recv_cq) {
+	if (unlikely(send_cq == NULL && recv_cq == NULL)) {
+		__release(&recv_cq->lock);
+		__release(&send_cq->lock);
+	} else if (unlikely(send_cq != NULL && recv_cq == NULL)) {
+		__release(&recv_cq->lock);
+		spin_unlock(&send_cq->lock);
+	} else if (unlikely(send_cq == NULL && recv_cq != NULL)) {
+		__release(&send_cq->lock);
+		spin_unlock(&recv_cq->lock);
+	} else if (send_cq == recv_cq) {
 		__release(&recv_cq->lock);
 		spin_unlock_irq(&send_cq->lock);
 	} else if (send_cq->cqn < recv_cq->cqn) {
@@ -1264,26 +1255,24 @@
 	}
 }
 
-static void *get_wqe(struct hns_roce_qp *hr_qp, int offset)
+static inline void *get_wqe(struct hns_roce_qp *hr_qp, int offset)
 {
-
-	return hns_roce_buf_offset(&hr_qp->hr_buf, offset);
+	return hns_roce_buf_offset(hr_qp->mtr.kmem, offset);
 }
 
-void *get_recv_wqe(struct hns_roce_qp *hr_qp, int n)
+void *hns_roce_get_recv_wqe(struct hns_roce_qp *hr_qp, int n)
 {
 	return get_wqe(hr_qp, hr_qp->rq.offset + (n << hr_qp->rq.wqe_shift));
 }
 
-void *get_send_wqe(struct hns_roce_qp *hr_qp, int n)
+void *hns_roce_get_send_wqe(struct hns_roce_qp *hr_qp, int n)
 {
 	return get_wqe(hr_qp, hr_qp->sq.offset + (n << hr_qp->sq.wqe_shift));
 }
 
-void *get_send_extend_sge(struct hns_roce_qp *hr_qp, int n)
+void *hns_roce_get_extend_sge(struct hns_roce_qp *hr_qp, int n)
 {
-	return hns_roce_buf_offset(&hr_qp->hr_buf, hr_qp->sge.offset +
-					(n << hr_qp->sge.sge_shift));
+	return get_wqe(hr_qp, hr_qp->sge.offset + (n << hr_qp->sge.sge_shift));
 }
 
 bool hns_roce_wq_overflow(struct hns_roce_wq *hr_wq, int nreq,
@@ -1293,7 +1282,7 @@
 	u32 cur;
 
 	cur = hr_wq->head - hr_wq->tail;
-	if (likely(cur + nreq < hr_wq->max_post))
+	if (likely(cur + nreq < hr_wq->wqe_cnt))
 		return false;
 
 	hr_cq = to_hr_cq(ib_cq);
@@ -1301,7 +1290,7 @@
 	cur = hr_wq->head - hr_wq->tail;
 	spin_unlock(&hr_cq->lock);
 
-	return cur + nreq >= hr_wq->max_post;
+	return cur + nreq >= hr_wq->wqe_cnt;
 }
 
 int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev)
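Note on the hns_roce_lock_cqs()/hns_roce_unlock_cqs() hunks above: besides tolerating NULL CQs, they keep the classic ordered double-lock idiom (always take the lock of the CQ with the smaller CQN first) so two paths locking the same CQ pair can never deadlock. A minimal sketch of that idiom follows; demo_cq and demo_lock_pair are illustrative names, not part of the driver.

#include <linux/spinlock.h>

/* Stand-in for a completion queue: only a lock and an ordering key. */
struct demo_cq {
	spinlock_t lock;
	unsigned long cqn;
};

/* Always lock the CQ with the smaller cqn first to avoid ABBA deadlock. */
static void demo_lock_pair(struct demo_cq *a, struct demo_cq *b)
{
	if (a == b) {
		spin_lock_irq(&a->lock);
	} else if (a->cqn < b->cqn) {
		spin_lock_irq(&a->lock);
		spin_lock_nested(&b->lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock_irq(&b->lock);
		spin_lock_nested(&a->lock, SINGLE_DEPTH_NESTING);
	}
}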
diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c
index 39c0821..259444c 100644
--- a/drivers/infiniband/hw/hns/hns_roce_restrack.c
+++ b/drivers/infiniband/hw/hns/hns_roce_restrack.c
@@ -76,10 +76,9 @@
 	return -EMSGSIZE;
 }
 
-static int hns_roce_fill_res_cq_entry(struct sk_buff *msg,
-				      struct rdma_restrack_entry *res)
+int hns_roce_fill_res_cq_entry(struct sk_buff *msg,
+			       struct ib_cq *ib_cq)
 {
-	struct ib_cq *ib_cq = container_of(res, struct ib_cq, res);
 	struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
 	struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
 	struct hns_roce_v2_cq_context *context;
@@ -95,7 +94,7 @@
 
 	ret = hr_dev->dfx->query_cqc_info(hr_dev, hr_cq->cqn, (int *)context);
 	if (ret)
-		return -EINVAL;
+		goto err;
 
 	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER);
 	if (!table_attr) {
@@ -119,12 +118,3 @@
 	kfree(context);
 	return ret;
 }
-
-int hns_roce_fill_res_entry(struct sk_buff *msg,
-			    struct rdma_restrack_entry *res)
-{
-	if (res->type == RDMA_RESTRACK_CQ)
-		return hns_roce_fill_res_cq_entry(msg, res);
-
-	return 0;
-}
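With this change hns_roce_fill_res_cq_entry() is exported with the per-object signature used by the restrack fill_res_cq_entry callback instead of going through a generic dispatcher. Its body keeps the usual restrack pattern of nesting driver attributes under RDMA_NLDEV_ATTR_DRIVER; the sketch below outlines that pattern only (demo_fill and the "cq_depth" attribute name are made up, and the header choices are from memory, so treat it as an outline rather than the driver's code).

#include <linux/errno.h>
#include <net/netlink.h>
#include <rdma/rdma_netlink.h>
#include <rdma/restrack.h>

static int demo_fill(struct sk_buff *msg, u32 depth)
{
	struct nlattr *table_attr;

	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER);
	if (!table_attr)
		return -EMSGSIZE;

	if (rdma_nl_put_driver_u32(msg, "cq_depth", depth))
		goto err;

	nla_nest_end(msg, table_attr);
	return 0;

err:
	nla_nest_cancel(msg, table_attr);
	return -EMSGSIZE;
}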
diff --git a/drivers/infiniband/hw/hns/hns_roce_srq.c b/drivers/infiniband/hw/hns/hns_roce_srq.c
index 108667a..08df97e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_srq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_srq.c
@@ -59,74 +59,77 @@
 	}
 }
 
-static int hns_roce_sw2hw_srq(struct hns_roce_dev *dev,
-			      struct hns_roce_cmd_mailbox *mailbox,
-			      unsigned long srq_num)
+static int hns_roce_hw_create_srq(struct hns_roce_dev *dev,
+				  struct hns_roce_cmd_mailbox *mailbox,
+				  unsigned long srq_num)
 {
 	return hns_roce_cmd_mbox(dev, mailbox->dma, 0, srq_num, 0,
-				 HNS_ROCE_CMD_SW2HW_SRQ,
+				 HNS_ROCE_CMD_CREATE_SRQ,
 				 HNS_ROCE_CMD_TIMEOUT_MSECS);
 }
 
-static int hns_roce_hw2sw_srq(struct hns_roce_dev *dev,
-			     struct hns_roce_cmd_mailbox *mailbox,
-			     unsigned long srq_num)
+static int hns_roce_hw_destroy_srq(struct hns_roce_dev *dev,
+				   struct hns_roce_cmd_mailbox *mailbox,
+				   unsigned long srq_num)
 {
 	return hns_roce_cmd_mbox(dev, 0, mailbox ? mailbox->dma : 0, srq_num,
-				 mailbox ? 0 : 1, HNS_ROCE_CMD_HW2SW_SRQ,
+				 mailbox ? 0 : 1, HNS_ROCE_CMD_DESTROY_SRQ,
 				 HNS_ROCE_CMD_TIMEOUT_MSECS);
 }
 
-static int hns_roce_srq_alloc(struct hns_roce_dev *hr_dev, u32 pdn, u32 cqn,
-			      u16 xrcd, struct hns_roce_mtt *hr_mtt,
-			      u64 db_rec_addr, struct hns_roce_srq *srq)
+static int alloc_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq,
+		      u32 pdn, u32 cqn, u16 xrcd, u64 db_rec_addr)
 {
 	struct hns_roce_srq_table *srq_table = &hr_dev->srq_table;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_cmd_mailbox *mailbox;
-	dma_addr_t dma_handle_wqe;
-	dma_addr_t dma_handle_idx;
-	u64 *mtts_wqe;
-	u64 *mtts_idx;
+	u64 mtts_wqe[MTT_MIN_COUNT] = { 0 };
+	u64 mtts_idx[MTT_MIN_COUNT] = { 0 };
+	dma_addr_t dma_handle_wqe = 0;
+	dma_addr_t dma_handle_idx = 0;
 	int ret;
 
 	/* Get the physical address of srq buf */
-	mtts_wqe = hns_roce_table_find(hr_dev,
-				       &hr_dev->mr_table.mtt_srqwqe_table,
-				       srq->mtt.first_seg,
-				       &dma_handle_wqe);
-	if (!mtts_wqe) {
-		dev_err(hr_dev->dev,
-			"SRQ alloc.Failed to find srq buf addr.\n");
-		return -EINVAL;
+	ret = hns_roce_mtr_find(hr_dev, &srq->buf_mtr, 0, mtts_wqe,
+				ARRAY_SIZE(mtts_wqe), &dma_handle_wqe);
+	if (ret < 1) {
+		ibdev_err(ibdev, "failed to find mtr for SRQ WQE, ret = %d.\n",
+			  ret);
+		return -ENOBUFS;
 	}
 
 	/* Get physical address of idx que buf */
-	mtts_idx = hns_roce_table_find(hr_dev, &hr_dev->mr_table.mtt_idx_table,
-				       srq->idx_que.mtt.first_seg,
-				       &dma_handle_idx);
-	if (!mtts_idx) {
-		dev_err(hr_dev->dev,
-			"SRQ alloc.Failed to find idx que buf addr.\n");
-		return -EINVAL;
+	ret = hns_roce_mtr_find(hr_dev, &srq->idx_que.mtr, 0, mtts_idx,
+				ARRAY_SIZE(mtts_idx), &dma_handle_idx);
+	if (ret < 1) {
+		ibdev_err(ibdev, "failed to find mtr for SRQ idx, ret = %d.\n",
+			  ret);
+		return -ENOBUFS;
 	}
 
 	ret = hns_roce_bitmap_alloc(&srq_table->bitmap, &srq->srqn);
-	if (ret == -1) {
-		dev_err(hr_dev->dev, "SRQ alloc.Failed to alloc index.\n");
+	if (ret) {
+		ibdev_err(ibdev,
+			  "failed to alloc SRQ number, ret = %d.\n", ret);
 		return -ENOMEM;
 	}
 
 	ret = hns_roce_table_get(hr_dev, &srq_table->table, srq->srqn);
-	if (ret)
+	if (ret) {
+		ibdev_err(ibdev, "failed to get SRQC table, ret = %d.\n", ret);
 		goto err_out;
+	}
 
 	ret = xa_err(xa_store(&srq_table->xa, srq->srqn, srq, GFP_KERNEL));
-	if (ret)
+	if (ret) {
+		ibdev_err(ibdev, "failed to store SRQC, ret = %d.\n", ret);
 		goto err_put;
+	}
 
 	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
-	if (IS_ERR(mailbox)) {
-		ret = PTR_ERR(mailbox);
+	if (IS_ERR_OR_NULL(mailbox)) {
+		ret = -ENOMEM;
+		ibdev_err(ibdev, "failed to alloc mailbox for SRQC.\n");
 		goto err_xa;
 	}
 
@@ -134,10 +137,12 @@
 			       mtts_wqe, mtts_idx, dma_handle_wqe,
 			       dma_handle_idx);
 
-	ret = hns_roce_sw2hw_srq(hr_dev, mailbox, srq->srqn);
+	ret = hns_roce_hw_create_srq(hr_dev, mailbox, srq->srqn);
 	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
-	if (ret)
+	if (ret) {
+		ibdev_err(ibdev, "failed to config SRQC, ret = %d.\n", ret);
 		goto err_xa;
+	}
 
 	atomic_set(&srq->refcount, 1);
 	init_completion(&srq->free);
@@ -154,15 +159,14 @@
 	return ret;
 }
 
-static void hns_roce_srq_free(struct hns_roce_dev *hr_dev,
-			      struct hns_roce_srq *srq)
+static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
 {
 	struct hns_roce_srq_table *srq_table = &hr_dev->srq_table;
 	int ret;
 
-	ret = hns_roce_hw2sw_srq(hr_dev, NULL, srq->srqn);
+	ret = hns_roce_hw_destroy_srq(hr_dev, NULL, srq->srqn);
 	if (ret)
-		dev_err(hr_dev->dev, "HW2SW_SRQ failed (%d) for CQN %06lx\n",
+		dev_err(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n",
 			ret, srq->srqn);
 
 	xa_erase(&srq_table->xa, srq->srqn);
@@ -175,289 +179,207 @@
 	hns_roce_bitmap_free(&srq_table->bitmap, srq->srqn, BITMAP_NO_RR);
 }
 
-static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata,
-			   int srq_buf_size)
+static int alloc_srq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq,
+			 struct ib_udata *udata, unsigned long addr)
 {
-	struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device);
-	struct hns_roce_ib_create_srq  ucmd;
-	struct hns_roce_buf *buf;
-	int ret;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_buf_attr buf_attr = {};
+	int err;
 
-	if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
-		return -EFAULT;
+	srq->wqe_shift = ilog2(roundup_pow_of_two(max(HNS_ROCE_SGE_SIZE,
+						      HNS_ROCE_SGE_SIZE *
+						      srq->max_gs)));
 
-	srq->umem = ib_umem_get(udata, ucmd.buf_addr, srq_buf_size, 0, 0);
-	if (IS_ERR(srq->umem))
-		return PTR_ERR(srq->umem);
+	buf_attr.page_shift = hr_dev->caps.srqwqe_buf_pg_sz + HNS_HW_PAGE_SHIFT;
+	buf_attr.region[0].size = to_hr_hem_entries_size(srq->wqe_cnt,
+							 srq->wqe_shift);
+	buf_attr.region[0].hopnum = hr_dev->caps.srqwqe_hop_num;
+	buf_attr.region_count = 1;
+	buf_attr.fixed_page = true;
 
-	buf = &srq->buf;
-	buf->npages = (ib_umem_page_count(srq->umem) +
-		       (1 << hr_dev->caps.srqwqe_buf_pg_sz) - 1) /
-		      (1 << hr_dev->caps.srqwqe_buf_pg_sz);
-	buf->page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz;
-	ret = hns_roce_mtt_init(hr_dev, buf->npages, buf->page_shift,
-				&srq->mtt);
-	if (ret)
-		goto err_user_buf;
+	err = hns_roce_mtr_create(hr_dev, &srq->buf_mtr, &buf_attr,
+				  hr_dev->caps.srqwqe_ba_pg_sz +
+				  HNS_HW_PAGE_SHIFT, udata, addr);
+	if (err)
+		ibdev_err(ibdev,
+			  "failed to alloc SRQ buf mtr, ret = %d.\n", err);
 
-	ret = hns_roce_ib_umem_write_mtt(hr_dev, &srq->mtt, srq->umem);
-	if (ret)
-		goto err_user_srq_mtt;
+	return err;
+}
 
-	/* config index queue BA */
-	srq->idx_que.umem = ib_umem_get(udata, ucmd.que_addr,
-					srq->idx_que.buf_size, 0, 0);
-	if (IS_ERR(srq->idx_que.umem)) {
-		dev_err(hr_dev->dev, "ib_umem_get error for index queue\n");
-		ret = PTR_ERR(srq->idx_que.umem);
-		goto err_user_srq_mtt;
+static void free_srq_buf(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
+{
+	hns_roce_mtr_destroy(hr_dev, &srq->buf_mtr);
+}
+
+static int alloc_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq,
+			 struct ib_udata *udata, unsigned long addr)
+{
+	struct hns_roce_idx_que *idx_que = &srq->idx_que;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_buf_attr buf_attr = {};
+	int err;
+
+	srq->idx_que.entry_shift = ilog2(HNS_ROCE_IDX_QUE_ENTRY_SZ);
+
+	buf_attr.page_shift = hr_dev->caps.idx_buf_pg_sz + HNS_HW_PAGE_SHIFT;
+	buf_attr.region[0].size = to_hr_hem_entries_size(srq->wqe_cnt,
+					srq->idx_que.entry_shift);
+	buf_attr.region[0].hopnum = hr_dev->caps.idx_hop_num;
+	buf_attr.region_count = 1;
+	buf_attr.fixed_page = true;
+
+	err = hns_roce_mtr_create(hr_dev, &idx_que->mtr, &buf_attr,
+				  hr_dev->caps.idx_ba_pg_sz + HNS_HW_PAGE_SHIFT,
+				  udata, addr);
+	if (err) {
+		ibdev_err(ibdev,
+			  "failed to alloc SRQ idx mtr, ret = %d.\n", err);
+		return err;
 	}
 
-	buf = &srq->idx_que.idx_buf;
-	buf->npages = DIV_ROUND_UP(ib_umem_page_count(srq->idx_que.umem),
-				   1 << hr_dev->caps.idx_buf_pg_sz);
-	buf->page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz;
-	ret = hns_roce_mtt_init(hr_dev, buf->npages, buf->page_shift,
-				&srq->idx_que.mtt);
-	if (ret) {
-		dev_err(hr_dev->dev, "hns_roce_mtt_init error for idx que\n");
-		goto err_user_idx_mtt;
-	}
+	if (!udata) {
+		idx_que->bitmap = bitmap_zalloc(srq->wqe_cnt, GFP_KERNEL);
+		if (!idx_que->bitmap) {
+			ibdev_err(ibdev, "failed to alloc SRQ idx bitmap.\n");
+			err = -ENOMEM;
+			goto err_idx_mtr;
+		}
 
-	ret = hns_roce_ib_umem_write_mtt(hr_dev, &srq->idx_que.mtt,
-					 srq->idx_que.umem);
-	if (ret) {
-		dev_err(hr_dev->dev,
-			"hns_roce_ib_umem_write_mtt error for idx que\n");
-		goto err_user_idx_buf;
 	}
 
 	return 0;
+err_idx_mtr:
+	hns_roce_mtr_destroy(hr_dev, &idx_que->mtr);
 
-err_user_idx_buf:
-	hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
-
-err_user_idx_mtt:
-	ib_umem_release(srq->idx_que.umem);
-
-err_user_srq_mtt:
-	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
-
-err_user_buf:
-	ib_umem_release(srq->umem);
-
-	return ret;
+	return err;
 }
 
-static int hns_roce_create_idx_que(struct ib_pd *pd, struct hns_roce_srq *srq,
-				   u32 page_shift)
+static void free_srq_idx(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
 {
-	struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
 	struct hns_roce_idx_que *idx_que = &srq->idx_que;
 
-	idx_que->bitmap = bitmap_zalloc(srq->max, GFP_KERNEL);
-	if (!idx_que->bitmap)
-		return -ENOMEM;
-
-	idx_que->buf_size = srq->idx_que.buf_size;
-
-	if (hns_roce_buf_alloc(hr_dev, idx_que->buf_size, (1 << page_shift) * 2,
-			       &idx_que->idx_buf, page_shift)) {
-		bitmap_free(idx_que->bitmap);
-		return -ENOMEM;
-	}
-
-	return 0;
+	bitmap_free(idx_que->bitmap);
+	idx_que->bitmap = NULL;
+	hns_roce_mtr_destroy(hr_dev, &idx_que->mtr);
 }
 
-static int create_kernel_srq(struct hns_roce_srq *srq, int srq_buf_size)
+static int alloc_srq_wrid(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
 {
-	struct hns_roce_dev *hr_dev = to_hr_dev(srq->ibsrq.device);
-	u32 page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz;
-	int ret;
-
-	if (hns_roce_buf_alloc(hr_dev, srq_buf_size, (1 << page_shift) * 2,
-			       &srq->buf, page_shift))
-		return -ENOMEM;
-
 	srq->head = 0;
-	srq->tail = srq->max - 1;
-
-	ret = hns_roce_mtt_init(hr_dev, srq->buf.npages, srq->buf.page_shift,
-				&srq->mtt);
-	if (ret)
-		goto err_kernel_buf;
-
-	ret = hns_roce_buf_write_mtt(hr_dev, &srq->mtt, &srq->buf);
-	if (ret)
-		goto err_kernel_srq_mtt;
-
-	page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz;
-	ret = hns_roce_create_idx_que(srq->ibsrq.pd, srq, page_shift);
-	if (ret) {
-		dev_err(hr_dev->dev, "Create idx queue fail(%d)!\n", ret);
-		goto err_kernel_srq_mtt;
-	}
-
-	/* Init mtt table for idx_que */
-	ret = hns_roce_mtt_init(hr_dev, srq->idx_que.idx_buf.npages,
-				srq->idx_que.idx_buf.page_shift,
-				&srq->idx_que.mtt);
-	if (ret)
-		goto err_kernel_create_idx;
-
-	/* Write buffer address into the mtt table */
-	ret = hns_roce_buf_write_mtt(hr_dev, &srq->idx_que.mtt,
-				     &srq->idx_que.idx_buf);
-	if (ret)
-		goto err_kernel_idx_buf;
-
-	srq->wrid = kvmalloc_array(srq->max, sizeof(u64), GFP_KERNEL);
-	if (!srq->wrid) {
-		ret = -ENOMEM;
-		goto err_kernel_idx_buf;
-	}
+	srq->tail = srq->wqe_cnt - 1;
+	srq->wrid = kvmalloc_array(srq->wqe_cnt, sizeof(u64), GFP_KERNEL);
+	if (!srq->wrid)
+		return -ENOMEM;
 
 	return 0;
-
-err_kernel_idx_buf:
-	hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
-
-err_kernel_create_idx:
-	hns_roce_buf_free(hr_dev, srq->idx_que.buf_size,
-			  &srq->idx_que.idx_buf);
-	kfree(srq->idx_que.bitmap);
-
-err_kernel_srq_mtt:
-	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
-
-err_kernel_buf:
-	hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf);
-
-	return ret;
 }
 
-static void destroy_user_srq(struct hns_roce_dev *hr_dev,
-			     struct hns_roce_srq *srq)
-{
-	hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
-	ib_umem_release(srq->idx_que.umem);
-	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
-	ib_umem_release(srq->umem);
-}
-
-static void destroy_kernel_srq(struct hns_roce_dev *hr_dev,
-			       struct hns_roce_srq *srq, int srq_buf_size)
+static void free_srq_wrid(struct hns_roce_srq *srq)
 {
 	kvfree(srq->wrid);
-	hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
-	hns_roce_buf_free(hr_dev, srq->idx_que.buf_size, &srq->idx_que.idx_buf);
-	kfree(srq->idx_que.bitmap);
-	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
-	hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf);
+	srq->wrid = NULL;
 }
 
 int hns_roce_create_srq(struct ib_srq *ib_srq,
-			struct ib_srq_init_attr *srq_init_attr,
+			struct ib_srq_init_attr *init_attr,
 			struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ib_srq->device);
 	struct hns_roce_ib_create_srq_resp resp = {};
 	struct hns_roce_srq *srq = to_hr_srq(ib_srq);
-	int srq_desc_size;
-	int srq_buf_size;
-	int ret = 0;
+	struct ib_device *ibdev = &hr_dev->ib_dev;
+	struct hns_roce_ib_create_srq ucmd = {};
+	int ret;
 	u32 cqn;
 
 	/* Check the actual SRQ wqe and SRQ sge num */
-	if (srq_init_attr->attr.max_wr >= hr_dev->caps.max_srq_wrs ||
-	    srq_init_attr->attr.max_sge > hr_dev->caps.max_srq_sges)
+	if (init_attr->attr.max_wr >= hr_dev->caps.max_srq_wrs ||
+	    init_attr->attr.max_sge > hr_dev->caps.max_srq_sges)
 		return -EINVAL;
 
 	mutex_init(&srq->mutex);
 	spin_lock_init(&srq->lock);
 
-	srq->max = roundup_pow_of_two(srq_init_attr->attr.max_wr + 1);
-	srq->max_gs = srq_init_attr->attr.max_sge;
-
-	srq_desc_size = roundup_pow_of_two(max(16, 16 * srq->max_gs));
-
-	srq->wqe_shift = ilog2(srq_desc_size);
-
-	srq_buf_size = srq->max * srq_desc_size;
-
-	srq->idx_que.entry_sz = HNS_ROCE_IDX_QUE_ENTRY_SZ;
-	srq->idx_que.buf_size = srq->max * srq->idx_que.entry_sz;
-	srq->mtt.mtt_type = MTT_TYPE_SRQWQE;
-	srq->idx_que.mtt.mtt_type = MTT_TYPE_IDX;
+	srq->wqe_cnt = roundup_pow_of_two(init_attr->attr.max_wr + 1);
+	srq->max_gs = init_attr->attr.max_sge;
 
 	if (udata) {
-		ret = create_user_srq(srq, udata, srq_buf_size);
+		ret = ib_copy_from_udata(&ucmd, udata,
+					 min(udata->inlen, sizeof(ucmd)));
 		if (ret) {
-			dev_err(hr_dev->dev, "Create user srq failed\n");
-			goto err_srq;
-		}
-	} else {
-		ret = create_kernel_srq(srq, srq_buf_size);
-		if (ret) {
-			dev_err(hr_dev->dev, "Create kernel srq failed\n");
-			goto err_srq;
+			ibdev_err(ibdev, "failed to copy SRQ udata, ret = %d.\n",
+				  ret);
+			return ret;
 		}
 	}
 
-	cqn = ib_srq_has_cq(srq_init_attr->srq_type) ?
-	      to_hr_cq(srq_init_attr->ext.cq)->cqn : 0;
+	ret = alloc_srq_buf(hr_dev, srq, udata, ucmd.buf_addr);
+	if (ret) {
+		ibdev_err(ibdev,
+			  "failed to alloc SRQ buffer, ret = %d.\n", ret);
+		return ret;
+	}
 
+	ret = alloc_srq_idx(hr_dev, srq, udata, ucmd.que_addr);
+	if (ret) {
+		ibdev_err(ibdev, "failed to alloc SRQ idx, ret = %d.\n", ret);
+		goto err_buf_alloc;
+	}
+
+	if (!udata) {
+		ret = alloc_srq_wrid(hr_dev, srq);
+		if (ret) {
+			ibdev_err(ibdev, "failed to alloc SRQ wrid, ret = %d.\n",
+				  ret);
+			goto err_idx_alloc;
+		}
+	}
+
+	cqn = ib_srq_has_cq(init_attr->srq_type) ?
+	      to_hr_cq(init_attr->ext.cq)->cqn : 0;
 	srq->db_reg_l = hr_dev->reg_base + SRQ_DB_REG;
 
-	ret = hns_roce_srq_alloc(hr_dev, to_hr_pd(ib_srq->pd)->pdn, cqn, 0,
-				 &srq->mtt, 0, srq);
-	if (ret)
-		goto err_wrid;
+	ret = alloc_srqc(hr_dev, srq, to_hr_pd(ib_srq->pd)->pdn, cqn, 0, 0);
+	if (ret) {
+		ibdev_err(ibdev,
+			  "failed to alloc SRQ context, ret = %d.\n", ret);
+		goto err_wrid_alloc;
+	}
 
 	srq->event = hns_roce_ib_srq_event;
 	resp.srqn = srq->srqn;
 
 	if (udata) {
-		if (ib_copy_to_udata(udata, &resp,
-				     min(udata->outlen, sizeof(resp)))) {
-			ret = -EFAULT;
+		ret = ib_copy_to_udata(udata, &resp,
+				       min(udata->outlen, sizeof(resp)));
+		if (ret)
 			goto err_srqc_alloc;
-		}
 	}
 
 	return 0;
 
 err_srqc_alloc:
-	hns_roce_srq_free(hr_dev, srq);
-
-err_wrid:
-	if (udata)
-		destroy_user_srq(hr_dev, srq);
-	else
-		destroy_kernel_srq(hr_dev, srq, srq_buf_size);
-
-err_srq:
+	free_srqc(hr_dev, srq);
+err_wrid_alloc:
+	free_srq_wrid(srq);
+err_idx_alloc:
+	free_srq_idx(hr_dev, srq);
+err_buf_alloc:
+	free_srq_buf(hr_dev, srq);
 	return ret;
 }
 
-void hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
+int hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
 	struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
 	struct hns_roce_srq *srq = to_hr_srq(ibsrq);
 
-	hns_roce_srq_free(hr_dev, srq);
-	hns_roce_mtt_cleanup(hr_dev, &srq->mtt);
-
-	if (udata) {
-		hns_roce_mtt_cleanup(hr_dev, &srq->idx_que.mtt);
-	} else {
-		kvfree(srq->wrid);
-		hns_roce_buf_free(hr_dev, srq->max << srq->wqe_shift,
-				  &srq->buf);
-	}
-	ib_umem_release(srq->idx_que.umem);
-	ib_umem_release(srq->umem);
+	free_srqc(hr_dev, srq);
+	free_srq_idx(hr_dev, srq);
+	free_srq_wrid(srq);
+	free_srq_buf(hr_dev, srq);
+	return 0;
 }
 
 int hns_roce_init_srq_table(struct hns_roce_dev *hr_dev)
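The rewritten hns_roce_create_srq() above composes small alloc_srq_buf()/alloc_srq_idx()/alloc_srq_wrid()/alloc_srqc() steps and unwinds them with goto labels in reverse order on failure. A minimal sketch of that unwind pattern, using made-up demo_* resources rather than the driver's helpers:

#include <linux/slab.h>

struct demo_srq {
	void *buf;
	void *idx;
	void *ctx;
};

static int demo_create_srq(struct demo_srq *s)
{
	s->buf = kzalloc(64, GFP_KERNEL);
	if (!s->buf)
		return -ENOMEM;

	s->idx = kzalloc(64, GFP_KERNEL);
	if (!s->idx)
		goto err_buf;

	s->ctx = kzalloc(64, GFP_KERNEL);
	if (!s->ctx)
		goto err_idx;

	return 0;

	/* Each label releases exactly what was set up before the failure. */
err_idx:
	kfree(s->idx);
err_buf:
	kfree(s->buf);
	return -ENOMEM;
}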
diff --git a/drivers/infiniband/hw/i40iw/Kconfig b/drivers/infiniband/hw/i40iw/Kconfig
index e4b45f4..7476f3b 100644
--- a/drivers/infiniband/hw/i40iw/Kconfig
+++ b/drivers/infiniband/hw/i40iw/Kconfig
@@ -5,5 +5,5 @@
 	depends on IPV6 || !IPV6
 	depends on PCI
 	select GENERIC_ALLOCATOR
-	---help---
+	help
 	Intel(R) Ethernet X722 iWARP Driver
diff --git a/drivers/infiniband/hw/i40iw/Makefile b/drivers/infiniband/hw/i40iw/Makefile
index 8942f82..34da9eb 100644
--- a/drivers/infiniband/hw/i40iw/Makefile
+++ b/drivers/infiniband/hw/i40iw/Makefile
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-ccflags-y :=  -I $(srctree)/drivers/net/ethernet/intel/i40e
 
 obj-$(CONFIG_INFINIBAND_I40IW) += i40iw.o
 
diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h
index 6d6719f..832b80d 100644
--- a/drivers/infiniband/hw/i40iw/i40iw.h
+++ b/drivers/infiniband/hw/i40iw/i40iw.h
@@ -45,6 +45,7 @@
 #include <linux/slab.h>
 #include <linux/io.h>
 #include <linux/crc32c.h>
+#include <linux/net/intel/i40e_client.h>
 #include <rdma/ib_smi.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_pack.h>
@@ -57,7 +58,6 @@
 #include "i40iw_d.h"
 #include "i40iw_hmc.h"
 
-#include <i40e_client.h>
 #include "i40iw_type.h"
 #include "i40iw_p.h"
 #include <rdma/i40iw-abi.h>
@@ -67,7 +67,7 @@
 #include "i40iw_user.h"
 #include "i40iw_puda.h"
 
-#define I40IW_FW_VERSION  2
+#define I40IW_FW_VER_DEFAULT 2
 #define I40IW_HW_VERSION  2
 
 #define I40IW_ARP_ADD     1
@@ -326,6 +326,26 @@
 };
 
 /**
+ * i40iw_fw_major_ver - get firmware major version
+ * @dev: iwarp device
+ **/
+static inline u64 i40iw_fw_major_ver(struct i40iw_sc_dev *dev)
+{
+	return RS_64(dev->feature_info[I40IW_FEATURE_FW_INFO],
+		     I40IW_FW_VER_MAJOR);
+}
+
+/**
+ * i40iw_fw_minor_ver - get firmware minor version
+ * @dev: iwarp device
+ **/
+static inline u64 i40iw_fw_minor_ver(struct i40iw_sc_dev *dev)
+{
+	return RS_64(dev->feature_info[I40IW_FEATURE_FW_INFO],
+		     I40IW_FW_VER_MINOR);
+}
+
+/**
  * to_iwdev - get device
  * @ibdev: ib device
  **/
@@ -362,15 +382,6 @@
 }
 
 /**
- * to_iwmr_from_ibfmr - get device memory region
- * @ibfmr: ib fmr
- **/
-static inline struct i40iw_mr *to_iwmr_from_ibfmr(struct ib_fmr *ibfmr)
-{
-	return container_of(ibfmr, struct i40iw_mr, ibfmr);
-}
-
-/**
  * to_iwmw - get device memory window
  * @ibmw: ib memory window
  **/
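The new i40iw_fw_major_ver()/i40iw_fw_minor_ver() helpers read the firmware version out of a packed 64-bit feature word via the driver's RS_64() shift-and-mask macro. Written out with explicit shifts, the extraction looks like the sketch below; the bit positions mirror the I40IW_FW_VER_MAJOR/MINOR defines added to i40iw_d.h later in this patch, and the demo_* names are illustrative.

#include <linux/types.h>

/* Minor version lives in bits 15:0, major version in bits 31:16. */
static inline u16 demo_fw_minor(u64 fw_info)
{
	return (u16)(fw_info & 0xffffULL);
}

static inline u16 demo_fw_major(u64 fw_info)
{
	return (u16)((fw_info >> 16) & 0xffffULL);
}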
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c
index 56c1e9a..3053c34 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -2443,7 +2443,7 @@
 	case I40IW_CM_STATE_FIN_WAIT1:
 	case I40IW_CM_STATE_LAST_ACK:
 		cm_node->cm_id->rem_ref(cm_node->cm_id);
-		/* fall through */
+		fallthrough;
 	case I40IW_CM_STATE_TIME_WAIT:
 		cm_node->state = I40IW_CM_STATE_CLOSED;
 		i40iw_rem_ref_cm_node(cm_node);
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h
index 66dc1ba..6e43e4d 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h
@@ -85,7 +85,7 @@
 	u8 flags;
 	u8 rev;
 	__be16 priv_data_len;
-	u8 priv_data[0];
+	u8 priv_data[];
 };
 
 #define ietf_mpa_req_resp_frame ietf_mpa_frame
@@ -101,7 +101,7 @@
 	u8 rev;
 	__be16 priv_data_len;
 	struct ietf_rtr_msg rtr_msg;
-	u8 priv_data[0];
+	u8 priv_data[];
 };
 
 struct i40iw_cm_node;
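The priv_data[0] -> priv_data[] hunks above convert the old GNU zero-length-array idiom to a C99 flexible array member, which lets the compiler and fortify checks reason about the trailing buffer. A minimal sketch of how such a structure is typically sized and allocated; demo_frame and demo_frame_alloc are illustrative, not i40iw code:

#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo_frame {
	u16 priv_data_len;
	u8 priv_data[];		/* flexible array member, must be last */
};

static struct demo_frame *demo_frame_alloc(u16 len)
{
	/* struct_size() computes sizeof(*f) + len bytes, overflow-checked. */
	struct demo_frame *f = kzalloc(struct_size(f, priv_data, len),
				       GFP_KERNEL);

	if (f)
		f->priv_data_len = len;
	return f;
}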
diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
index 4d841a3..86d3f8a 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
@@ -1022,6 +1022,95 @@
 }
 
 /**
+ * i40iw_sc_query_rdma_features_done - poll cqp for query features done
+ * @cqp: struct for cqp hw
+ */
+static enum i40iw_status_code
+i40iw_sc_query_rdma_features_done(struct i40iw_sc_cqp *cqp)
+{
+	return i40iw_sc_poll_for_cqp_op_done(
+		cqp, I40IW_CQP_OP_QUERY_RDMA_FEATURES, NULL);
+}
+
+/**
+ * i40iw_sc_query_rdma_features - query rdma features
+ * @cqp: struct for cqp hw
+ * @feat_mem: holds PA for HW to use
+ * @scratch: u64 saved to be used during cqp completion
+ */
+static enum i40iw_status_code
+i40iw_sc_query_rdma_features(struct i40iw_sc_cqp *cqp,
+			     struct i40iw_dma_mem *feat_mem, u64 scratch)
+{
+	u64 *wqe;
+	u64 header;
+
+	wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
+	if (!wqe)
+		return I40IW_ERR_RING_FULL;
+
+	set_64bit_val(wqe, 32, feat_mem->pa);
+
+	header = LS_64(I40IW_CQP_OP_QUERY_RDMA_FEATURES, I40IW_CQPSQ_OPCODE) |
+		 LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID) | feat_mem->size;
+
+	i40iw_insert_wqe_hdr(wqe, header);
+
+	i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "QUERY RDMA FEATURES WQE",
+			wqe, I40IW_CQP_WQE_SIZE * 8);
+
+	i40iw_sc_cqp_post_sq(cqp);
+
+	return 0;
+}
+
+/**
+ * i40iw_get_rdma_features - get RDMA features
+ * @dev - sc device struct
+ */
+enum i40iw_status_code i40iw_get_rdma_features(struct i40iw_sc_dev *dev)
+{
+	enum i40iw_status_code ret_code;
+	struct i40iw_dma_mem feat_buf;
+	u64 temp;
+	u16 byte_idx, feat_type, feat_cnt;
+
+	ret_code = i40iw_allocate_dma_mem(dev->hw, &feat_buf,
+					  I40IW_FEATURE_BUF_SIZE,
+					  I40IW_FEATURE_BUF_ALIGNMENT);
+
+	if (ret_code)
+		return I40IW_ERR_NO_MEMORY;
+
+	ret_code = i40iw_sc_query_rdma_features(dev->cqp, &feat_buf, 0);
+	if (!ret_code)
+		ret_code = i40iw_sc_query_rdma_features_done(dev->cqp);
+
+	if (ret_code)
+		goto exit;
+
+	get_64bit_val(feat_buf.va, 0, &temp);
+	feat_cnt = RS_64(temp, I40IW_FEATURE_CNT);
+	if (feat_cnt < I40IW_MAX_FEATURES) {
+		ret_code = I40IW_ERR_INVALID_FEAT_CNT;
+		goto exit;
+	} else if (feat_cnt > I40IW_MAX_FEATURES) {
+		i40iw_debug(dev, I40IW_DEBUG_CQP,
+			    "features buf size insufficient\n");
+	}
+
+	for (byte_idx = 0, feat_type = 0; feat_type < I40IW_MAX_FEATURES;
+	     feat_type++, byte_idx += 8) {
+		get_64bit_val((u64 *)feat_buf.va, byte_idx, &temp);
+		dev->feature_info[feat_type] = RS_64(temp, I40IW_FEATURE_INFO);
+	}
+exit:
+	i40iw_free_dma_mem(dev->hw, &feat_buf);
+
+	return ret_code;
+}
+
+/**
  * i40iw_sc_query_fpm_values_done - poll for cqp wqe completion for query fpm
  * @cqp: struct for cqp hw
  */
@@ -1875,7 +1964,6 @@
 		info->out_rdrsp = true;
 		break;
 	case I40IW_AE_SOURCE_RSVD:
-		/* fallthrough */
 	default:
 		break;
 	}
@@ -3673,14 +3761,14 @@
 					LS_64(1, I40IW_CQPSQ_UPESD_ENTRY_VALID)));
 
 		set_64bit_val(wqe, 56, info->entry[2].data);
-		/* fallthrough */
+		fallthrough;
 	case 2:
 		set_64bit_val(wqe, 32,
 			      (LS_64(info->entry[1].cmd, I40IW_CQPSQ_UPESD_SDCMD) |
 					LS_64(1, I40IW_CQPSQ_UPESD_ENTRY_VALID)));
 
 		set_64bit_val(wqe, 40, info->entry[1].data);
-		/* fallthrough */
+		fallthrough;
 	case 1:
 		set_64bit_val(wqe, 0,
 			      LS_64(info->entry[0].cmd, I40IW_CQPSQ_UPESD_SDCMD));
@@ -4265,6 +4353,13 @@
 				true,
 				I40IW_CQP_WAIT_EVENT);
 		break;
+	case OP_QUERY_RDMA_FEATURES:
+		values_mem.pa = pcmdinfo->in.u.query_rdma_features.cap_pa;
+		values_mem.va = pcmdinfo->in.u.query_rdma_features.cap_va;
+		status = i40iw_sc_query_rdma_features(
+			pcmdinfo->in.u.query_rdma_features.cqp, &values_mem,
+			pcmdinfo->in.u.query_rdma_features.scratch);
+		break;
 	default:
 		status = I40IW_NOT_SUPPORTED;
 		break;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h b/drivers/infiniband/hw/i40iw/i40iw_d.h
index 6ddaeec..e8367d6 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_d.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_d.h
@@ -403,7 +403,7 @@
 #define I40IW_CQP_OP_MANAGE_ARP                 0x0f
 #define I40IW_CQP_OP_MANAGE_VF_PBLE_BP          0x10
 #define I40IW_CQP_OP_MANAGE_PUSH_PAGES          0x11
-#define I40IW_CQP_OP_MANAGE_PE_TEAM             0x12
+#define I40IW_CQP_OP_QUERY_RDMA_FEATURES	0x12
 #define I40IW_CQP_OP_UPLOAD_CONTEXT             0x13
 #define I40IW_CQP_OP_ALLOCATE_LOC_MAC_IP_TABLE_ENTRY 0x14
 #define I40IW_CQP_OP_MANAGE_HMC_PM_FUNC_TABLE   0x15
@@ -431,6 +431,24 @@
 #define I40IW_CQP_OP_SHMC_PAGES_ALLOCATED       0x2b
 #define I40IW_CQP_OP_SET_HMC_RESOURCE_PROFILE   0x2d
 
+#define I40IW_FEATURE_BUF_SIZE                  (8 * I40IW_MAX_FEATURES)
+
+#define I40IW_FW_VER_MINOR_SHIFT        0
+#define I40IW_FW_VER_MINOR_MASK         \
+	(0xffffULL << I40IW_FW_VER_MINOR_SHIFT)
+
+#define I40IW_FW_VER_MAJOR_SHIFT        16
+#define I40IW_FW_VER_MAJOR_MASK	        \
+	(0xffffULL << I40IW_FW_VER_MAJOR_SHIFT)
+
+#define I40IW_FEATURE_INFO_SHIFT        0
+#define I40IW_FEATURE_INFO_MASK         \
+	(0xffffULL << I40IW_FEATURE_INFO_SHIFT)
+
+#define I40IW_FEATURE_CNT_SHIFT         32
+#define I40IW_FEATURE_CNT_MASK          \
+	(0xffffULL << I40IW_FEATURE_CNT_SHIFT)
+
 #define I40IW_UDA_QPSQ_NEXT_HEADER_SHIFT 16
 #define I40IW_UDA_QPSQ_NEXT_HEADER_MASK ((u64)0xff << I40IW_UDA_QPSQ_NEXT_HEADER_SHIFT)
 
@@ -1529,7 +1547,8 @@
 	I40IW_AEQ_ALIGNMENT =		0x100,
 	I40IW_CEQ_ALIGNMENT =		0x100,
 	I40IW_CQ0_ALIGNMENT =		0x100,
-	I40IW_SD_BUF_ALIGNMENT =	0x80
+	I40IW_SD_BUF_ALIGNMENT =	0x80,
+	I40IW_FEATURE_BUF_ALIGNMENT =	0x8
 };
 
 #define I40IW_WQE_SIZE_64	64
@@ -1732,6 +1751,7 @@
 #define OP_REQUESTED_COMMANDS                   31
 #define OP_COMPLETED_COMMANDS                   32
 #define OP_GEN_AE                               33
-#define OP_SIZE_CQP_STAT_ARRAY                  34
+#define OP_QUERY_RDMA_FEATURES                  34
+#define OP_SIZE_CQP_STAT_ARRAY			35
 
 #endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c
index a751250..56fdc16 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_hw.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c
@@ -353,7 +353,6 @@
 				i40iw_cm_disconn(iwqp);
 			break;
 		case I40IW_AE_BAD_CLOSE:
-			/* fall through */
 		case I40IW_AE_RESET_SENT:
 			i40iw_next_iw_state(iwqp, I40IW_QP_STATE_ERROR, 1, 0, 0);
 			i40iw_cm_disconn(iwqp);
@@ -413,7 +412,7 @@
 		case I40IW_AE_UDA_XMIT_DGRAM_TOO_LONG:
 		case I40IW_AE_UDA_XMIT_DGRAM_TOO_SHORT:
 			ctx_info->err_rq_idx_valid = false;
-			/* fall through */
+			fallthrough;
 		default:
 			if (!info->sq && ctx_info->err_rq_idx_valid) {
 				ctx_info->err_rq_idx = info->wqe_idx;
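Throughout this patch the /* fall through */ comments are replaced by the fallthrough pseudo-keyword, which expands to the compiler's fallthrough attribute so -Wimplicit-fallthrough can verify every intentional case fall-through. A small sketch of the resulting switch shape, with illustrative demo_* names:

#include <linux/compiler_attributes.h>

static int demo_classify(int event)
{
	int action = 0;

	switch (event) {
	case 1:
		action |= 0x1;
		fallthrough;	/* case 1 deliberately also does case 2's work */
	case 2:
		action |= 0x2;
		break;
	default:
		break;
	}

	return action;
}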
diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c
index f1b0290..584932d 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_main.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_main.c
@@ -188,9 +188,9 @@
  * i40iw_dpc - tasklet for aeq and ceq 0
  * @data: iwarp device
  */
-static void i40iw_dpc(unsigned long data)
+static void i40iw_dpc(struct tasklet_struct *t)
 {
-	struct i40iw_device *iwdev = (struct i40iw_device *)data;
+	struct i40iw_device *iwdev = from_tasklet(iwdev, t, dpc_tasklet);
 
 	if (iwdev->msix_shared)
 		i40iw_process_ceq(iwdev, iwdev->ceqlist);
@@ -202,9 +202,9 @@
  * i40iw_ceq_dpc - dpc handler for CEQ
  * @data: data points to CEQ
  */
-static void i40iw_ceq_dpc(unsigned long data)
+static void i40iw_ceq_dpc(struct tasklet_struct *t)
 {
-	struct i40iw_ceq *iwceq = (struct i40iw_ceq *)data;
+	struct i40iw_ceq *iwceq = from_tasklet(iwceq, t, dpc_tasklet);
 	struct i40iw_device *iwdev = iwceq->iwdev;
 
 	i40iw_process_ceq(iwdev, iwceq);
@@ -685,10 +685,10 @@
 	enum i40iw_status_code status;
 
 	if (iwdev->msix_shared && !ceq_id) {
-		tasklet_init(&iwdev->dpc_tasklet, i40iw_dpc, (unsigned long)iwdev);
+		tasklet_setup(&iwdev->dpc_tasklet, i40iw_dpc);
 		status = request_irq(msix_vec->irq, i40iw_irq_handler, 0, "AEQCEQ", iwdev);
 	} else {
-		tasklet_init(&iwceq->dpc_tasklet, i40iw_ceq_dpc, (unsigned long)iwceq);
+		tasklet_setup(&iwceq->dpc_tasklet, i40iw_ceq_dpc);
 		status = request_irq(msix_vec->irq, i40iw_ceq_handler, 0, "CEQ", iwceq);
 	}
 
@@ -837,7 +837,7 @@
 	u32 ret = 0;
 
 	if (!iwdev->msix_shared) {
-		tasklet_init(&iwdev->dpc_tasklet, i40iw_dpc, (unsigned long)iwdev);
+		tasklet_setup(&iwdev->dpc_tasklet, i40iw_dpc);
 		ret = request_irq(msix_vec->irq, i40iw_irq_handler, 0, "i40iw", iwdev);
 	}
 	if (ret) {
@@ -1208,22 +1208,19 @@
 {
 	struct net_device *dev;
 	struct in_device *idev;
-	bool got_lock = true;
 	u32 ip_addr;
 
-	if (!rtnl_trylock())
-		got_lock = false;
-
-	for_each_netdev(&init_net, dev) {
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
 		if ((((rdma_vlan_dev_vlan_id(dev) < 0xFFFF) &&
 		      (rdma_vlan_dev_real_dev(dev) == iwdev->netdev)) ||
-		    (dev == iwdev->netdev)) && (dev->flags & IFF_UP)) {
+		    (dev == iwdev->netdev)) && (READ_ONCE(dev->flags) & IFF_UP)) {
 			const struct in_ifaddr *ifa;
 
-			idev = in_dev_get(dev);
+			idev = __in_dev_get_rcu(dev);
 			if (!idev)
 				continue;
-			in_dev_for_each_ifa_rtnl(ifa, idev) {
+			in_dev_for_each_ifa_rcu(ifa, idev) {
 				i40iw_debug(&iwdev->sc_dev, I40IW_DEBUG_CM,
 					    "IP=%pI4, vlan_id=%d, MAC=%pM\n", &ifa->ifa_address,
 					     rdma_vlan_dev_vlan_id(dev), dev->dev_addr);
@@ -1235,12 +1232,9 @@
 						       true,
 						       I40IW_ARP_ADD);
 			}
-
-			in_dev_put(idev);
 		}
 	}
-	if (got_lock)
-		rtnl_unlock();
+	rcu_read_unlock();
 }
 
 /**
@@ -1491,36 +1485,35 @@
 		iwdev->iw_status = 0;
 		i40iw_port_ibevent(iwdev);
 		i40iw_destroy_rdma_device(iwdev->iwibdev);
-		/* fallthrough */
+		fallthrough;
 	case IP_ADDR_REGISTERED:
 		if (!iwdev->reset)
 			i40iw_del_macip_entry(iwdev, (u8)iwdev->mac_ip_table_idx);
-		/* fallthrough */
-		/* fallthrough */
+		fallthrough;
 	case PBLE_CHUNK_MEM:
 		i40iw_destroy_pble_pool(dev, iwdev->pble_rsrc);
-		/* fallthrough */
+		fallthrough;
 	case CEQ_CREATED:
 		i40iw_dele_ceqs(iwdev);
-		/* fallthrough */
+		fallthrough;
 	case AEQ_CREATED:
 		i40iw_destroy_aeq(iwdev);
-		/* fallthrough */
+		fallthrough;
 	case IEQ_CREATED:
 		i40iw_puda_dele_resources(&iwdev->vsi, I40IW_PUDA_RSRC_TYPE_IEQ, iwdev->reset);
-		/* fallthrough */
+		fallthrough;
 	case ILQ_CREATED:
 		i40iw_puda_dele_resources(&iwdev->vsi, I40IW_PUDA_RSRC_TYPE_ILQ, iwdev->reset);
-		/* fallthrough */
+		fallthrough;
 	case CCQ_CREATED:
 		i40iw_destroy_ccq(iwdev);
-		/* fallthrough */
+		fallthrough;
 	case HMC_OBJS_CREATED:
 		i40iw_del_hmc_objects(dev, dev->hmc_info, true, iwdev->reset);
-		/* fallthrough */
+		fallthrough;
 	case CQP_CREATED:
 		i40iw_destroy_cqp(iwdev, true);
-		/* fallthrough */
+		fallthrough;
 	case INITIAL_STATE:
 		i40iw_cleanup_cm_core(&iwdev->cm_core);
 		if (iwdev->vsi.pestat) {
@@ -1530,7 +1523,6 @@
 		i40iw_del_init_mem(iwdev);
 		break;
 	case INVALID_STATE:
-		/* fallthrough */
 	default:
 		i40iw_pr_err("bad init_state = %d\n", iwdev->init_state);
 		break;
@@ -1577,7 +1569,7 @@
 	status = i40iw_save_msix_info(iwdev, ldev);
 	if (status)
 		return status;
-	iwdev->hw.dev_context = (void *)ldev->pcidev;
+	iwdev->hw.pcidev = ldev->pcidev;
 	iwdev->hw.hw_addr = ldev->hw_addr;
 	status = i40iw_allocate_dma_mem(&iwdev->hw,
 					&iwdev->obj_mem, 8192, 4096);
@@ -1684,6 +1676,12 @@
 		status = i40iw_setup_ceqs(iwdev, ldev);
 		if (status)
 			break;
+
+		status = i40iw_get_rdma_features(dev);
+		if (status)
+			dev->feature_info[I40IW_FEATURE_FW_INFO] =
+				I40IW_FW_VER_DEFAULT;
+
 		iwdev->init_state = CEQ_CREATED;
 		status = i40iw_initialize_hw_resources(iwdev);
 		if (status)
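The i40iw_dpc()/i40iw_ceq_dpc() changes above follow the kernel-wide tasklet conversion: tasklet_setup() registers a callback that receives the tasklet pointer itself, and from_tasklet() (a container_of() wrapper) recovers the enclosing structure, replacing the old unsigned-long data argument. A minimal sketch with illustrative demo_* names:

#include <linux/interrupt.h>

struct demo_dev {
	struct tasklet_struct dpc_tasklet;
	int pending;
};

static void demo_dpc(struct tasklet_struct *t)
{
	/* Recover the demo_dev that embeds this tasklet. */
	struct demo_dev *dev = from_tasklet(dev, t, dpc_tasklet);

	dev->pending = 0;
}

static void demo_init(struct demo_dev *dev)
{
	tasklet_setup(&dev->dpc_tasklet, demo_dpc);
}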
diff --git a/drivers/infiniband/hw/i40iw/i40iw_p.h b/drivers/infiniband/hw/i40iw/i40iw_p.h
index 11d3a2a..4c42956 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_p.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_p.h
@@ -105,6 +105,7 @@
 							   bool poll_registers);
 
 enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_count);
+enum i40iw_status_code i40iw_get_rdma_features(struct i40iw_sc_dev *dev);
 
 void free_sd_mem(struct i40iw_sc_dev *dev);
 
diff --git a/drivers/infiniband/hw/i40iw/i40iw_pble.c b/drivers/infiniband/hw/i40iw/i40iw_pble.c
index 3fafc54..ae7d227 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_pble.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_pble.c
@@ -167,7 +167,7 @@
  */
 static void i40iw_free_vmalloc_mem(struct i40iw_hw *hw, struct i40iw_chunk *chunk)
 {
-	struct pci_dev *pcidev = (struct pci_dev *)hw->dev_context;
+	struct pci_dev *pcidev = hw->pcidev;
 	int i;
 
 	if (!chunk->pg_cnt)
@@ -193,7 +193,7 @@
 						    struct i40iw_chunk *chunk,
 						    int pg_cnt)
 {
-	struct pci_dev *pcidev = (struct pci_dev *)hw->dev_context;
+	struct pci_dev *pcidev = hw->pcidev;
 	struct page *page;
 	u8 *addr;
 	u32 size;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c
index d9c7ae6..924be4b 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_puda.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c
@@ -814,13 +814,13 @@
 	switch (rsrc->completion) {
 	case PUDA_HASH_CRC_COMPLETE:
 		i40iw_free_hash_desc(rsrc->hash_desc);
-		/* fall through */
+		fallthrough;
 	case PUDA_QP_CREATED:
 		if (!reset)
 			i40iw_puda_free_qp(rsrc);
 
 		i40iw_free_dma_mem(dev->hw, &rsrc->qpmem);
-		/* fallthrough */
+		fallthrough;
 	case PUDA_CQ_CREATED:
 		if (!reset)
 			i40iw_puda_free_cq(rsrc);
diff --git a/drivers/infiniband/hw/i40iw/i40iw_status.h b/drivers/infiniband/hw/i40iw/i40iw_status.h
index f7013f1..d1c5855 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_status.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_status.h
@@ -95,7 +95,8 @@
 	I40IW_ERR_INVALID_MAC_ADDR = -65,
 	I40IW_ERR_BAD_STAG      = -66,
 	I40IW_ERR_CQ_COMPL_ERROR = -67,
-	I40IW_ERR_QUEUE_DESTROYED = -68
+	I40IW_ERR_QUEUE_DESTROYED = -68,
+	I40IW_ERR_INVALID_FEAT_CNT = -69
 
 };
 #endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_type.h b/drivers/infiniband/hw/i40iw/i40iw_type.h
index adc8d2e..c3babf3 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_type.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_type.h
@@ -73,6 +73,7 @@
 struct i40iw_priv_qp_ops;
 struct i40iw_priv_cq_ops;
 struct i40iw_hmc_ops;
+struct pci_dev;
 
 enum i40iw_page_size {
 	I40IW_PAGE_SIZE_4K,
@@ -234,6 +235,11 @@
 	I40IW_HW_STAT_INDEX_MAX_64
 };
 
+enum i40iw_feature_type {
+	I40IW_FEATURE_FW_INFO = 0,
+	I40IW_MAX_FEATURES
+};
+
 struct i40iw_dev_hw_stats_offsets {
 	u32 stats_offset_32[I40IW_HW_STAT_INDEX_MAX_32];
 	u32 stats_offset_64[I40IW_HW_STAT_INDEX_MAX_64];
@@ -256,7 +262,7 @@
 
 struct i40iw_hw {
 	u8 __iomem *hw_addr;
-	void *dev_context;
+	struct pci_dev *pcidev;
 	struct i40iw_hmc_info hmc;
 };
 
@@ -501,6 +507,7 @@
 	const struct i40iw_vf_cqp_ops *iw_vf_cqp_ops;
 
 	struct i40iw_hmc_fpm_misc hmc_fpm_misc;
+	u64 feature_info[I40IW_MAX_FEATURES];
 	u32 debug_mask;
 	u8 hmc_fn_id;
 	bool is_pf;
@@ -1340,6 +1347,12 @@
 			struct i40iw_sc_qp *qp;
 			u64 scratch;
 		} suspend_resume;
+		struct {
+			struct i40iw_sc_cqp *cqp;
+			void *cap_va;
+			u64 cap_pa;
+			u64 scratch;
+		} query_rdma_features;
 	} u;
 };
 
diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c
index 72db7c1..644f8c6 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_utils.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c
@@ -190,9 +190,8 @@
 	switch (event) {
 	case NETDEV_DOWN:
 		action = I40IW_ARP_DELETE;
-		/* Fall through */
+		fallthrough;
 	case NETDEV_UP:
-		/* Fall through */
 	case NETDEV_CHANGEADDR:
 
 		/* Just skip if no need to handle ARP cache */
@@ -247,9 +246,8 @@
 	switch (event) {
 	case NETDEV_DOWN:
 		action = I40IW_ARP_DELETE;
-		/* Fall through */
+		fallthrough;
 	case NETDEV_UP:
-		/* Fall through */
 	case NETDEV_CHANGEADDR:
 		i40iw_manage_arp_cache(iwdev,
 				       netdev->dev_addr,
@@ -344,7 +342,7 @@
 	switch (event) {
 	case NETDEV_DOWN:
 		iwdev->iw_status = 0;
-		/* Fall through */
+		fallthrough;
 	case NETDEV_UP:
 		i40iw_port_ibevent(iwdev);
 		break;
@@ -714,7 +712,7 @@
 					      u64 size,
 					      u32 alignment)
 {
-	struct pci_dev *pcidev = (struct pci_dev *)hw->dev_context;
+	struct pci_dev *pcidev = hw->pcidev;
 
 	if (!mem)
 		return I40IW_ERR_PARAM;
@@ -733,7 +731,7 @@
  */
 void i40iw_free_dma_mem(struct i40iw_hw *hw, struct i40iw_dma_mem *mem)
 {
-	struct pci_dev *pcidev = (struct pci_dev *)hw->dev_context;
+	struct pci_dev *pcidev = hw->pcidev;
 
 	if (!mem || !mem->va)
 		return;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index 7e9c1a4..533f3ca 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -64,7 +64,8 @@
 		return -EINVAL;
 	memset(props, 0, sizeof(*props));
 	ether_addr_copy((u8 *)&props->sys_image_guid, iwdev->netdev->dev_addr);
-	props->fw_ver = I40IW_FW_VERSION;
+	props->fw_ver = i40iw_fw_major_ver(&iwdev->sc_dev) << 32 |
+			i40iw_fw_minor_ver(&iwdev->sc_dev);
 	props->device_cap_flags = iwdev->device_cap_flags;
 	props->vendor_id = iwdev->ldev->pcidev->vendor;
 	props->vendor_part_id = iwdev->ldev->pcidev->device;
@@ -82,7 +83,6 @@
 	props->max_qp_rd_atom = I40IW_MAX_IRD_SIZE;
 	props->max_qp_init_rd_atom = props->max_qp_rd_atom;
 	props->atomic_cap = IB_ATOMIC_NONE;
-	props->max_map_per_fmr = 1;
 	props->max_fast_reg_page_list_len = I40IW_MAX_PAGES_PER_FMR;
 	return 0;
 }
@@ -101,7 +101,6 @@
 	props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP |
 		IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
 	props->gid_tbl_len = 1;
-	props->pkey_tbl_len = 1;
 	props->active_width = IB_WIDTH_4X;
 	props->active_speed = 1;
 	props->max_msg_sz = I40IW_MAX_OUTBOUND_MESSAGE_SIZE;
@@ -176,11 +175,8 @@
 
 	dbaddr = I40IW_DB_ADDR_OFFSET + pci_resource_start(ucontext->iwdev->ldev->pcidev, 0);
 
-	if (io_remap_pfn_range(vma, vma->vm_start, dbaddr >> PAGE_SHIFT, PAGE_SIZE,
-			       pgprot_noncached(vma->vm_page_prot)))
-		return -EAGAIN;
-
-	return 0;
+	return rdma_user_mmap_io(context, vma, dbaddr >> PAGE_SHIFT, PAGE_SIZE,
+				 pgprot_noncached(vma->vm_page_prot), NULL);
 }
 
 /**
@@ -309,12 +305,13 @@
  * @ibpd: ptr of pd to be deallocated
  * @udata: user data or null for kernel object
  */
-static void i40iw_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+static int i40iw_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
 	struct i40iw_pd *iwpd = to_iwpd(ibpd);
 	struct i40iw_device *iwdev = to_iwdev(ibpd->device);
 
 	i40iw_rem_pdusecount(iwpd, iwdev);
+	return 0;
 }
 
 /**
@@ -360,7 +357,7 @@
 	i40iw_free_dma_mem(iwdev->sc_dev.hw, &iwqp->kqp.dma_mem);
 	kfree(iwqp->kqp.wrid_mem);
 	iwqp->kqp.wrid_mem = NULL;
-	kfree(iwqp->allocated_buffer);
+	kfree(iwqp);
 }
 
 /**
@@ -517,7 +514,6 @@
 	struct i40iw_create_qp_req req;
 	struct i40iw_create_qp_resp uresp;
 	u32 qp_num = 0;
-	void *mem;
 	enum i40iw_status_code ret;
 	int err_code;
 	int sq_size;
@@ -559,12 +555,10 @@
 	init_info.qp_uk_init_info.max_rq_frag_cnt = init_attr->cap.max_recv_sge;
 	init_info.qp_uk_init_info.max_inline_data = init_attr->cap.max_inline_data;
 
-	mem = kzalloc(sizeof(*iwqp), GFP_KERNEL);
-	if (!mem)
+	iwqp = kzalloc(sizeof(*iwqp), GFP_KERNEL);
+	if (!iwqp)
 		return ERR_PTR(-ENOMEM);
 
-	iwqp = (struct i40iw_qp *)mem;
-	iwqp->allocated_buffer = mem;
 	qp = &iwqp->sc_qp;
 	qp->back_qp = (void *)iwqp;
 	qp->push_idx = I40IW_INVALID_PUSH_PAGE_INDEX;
@@ -609,7 +603,7 @@
 	iwqp->ctx_info.qp_compl_ctx = (uintptr_t)qp;
 
 	if (init_attr->qp_type != IB_QPT_RC) {
-		err_code = -EINVAL;
+		err_code = -EOPNOTSUPP;
 		goto error;
 	}
 	if (iwdev->push_mode)
@@ -804,7 +798,7 @@
 	case I40IW_QP_STATE_RTS:
 		if (iwqp->iwarp_state == I40IW_QP_STATE_IDLE)
 			i40iw_send_reset(iwqp->cm_node);
-		/* fall through */
+		fallthrough;
 	case I40IW_QP_STATE_IDLE:
 	case I40IW_QP_STATE_TERMINATE:
 	case I40IW_QP_STATE_CLOSING:
@@ -1046,7 +1040,7 @@
  * @ib_cq: cq pointer
  * @udata: user data or NULL for kernel object
  */
-static void i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
+static int i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
 	struct i40iw_cq *iwcq;
 	struct i40iw_device *iwdev;
@@ -1058,6 +1052,7 @@
 	i40iw_cq_wq_destroy(iwdev, cq);
 	cq_free_resources(iwdev, iwcq);
 	i40iw_rem_devusecount(iwdev);
+	return 0;
 }
 
 /**
@@ -1314,8 +1309,7 @@
 	if (iwmr->type == IW_MEMREG_TYPE_QP)
 		iwpbl->qp_mr.sq_page = sg_page(region->sg_head.sgl);
 
-	rdma_for_each_block(region->sg_head.sgl, &biter, region->nmap,
-			    iwmr->page_size) {
+	rdma_umem_for_each_dma_block(region, &biter, iwmr->page_size) {
 		*pbl = rdma_block_iter_dma_address(&biter);
 		pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx);
 	}
@@ -1536,10 +1530,9 @@
  * @pd: ibpd pointer
  * @mr_type: memory for stag registrion
  * @max_num_sg: man number of pages
- * @udata: user data or NULL for kernel objects
  */
 static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-				    u32 max_num_sg, struct ib_udata *udata)
+				    u32 max_num_sg)
 {
 	struct i40iw_pd *iwpd = to_iwpd(pd);
 	struct i40iw_device *iwdev = to_iwdev(pd->device);
@@ -1739,22 +1732,22 @@
 	struct i40iw_mr *iwmr;
 	struct ib_umem *region;
 	struct i40iw_mem_reg_req req;
-	u64 pbl_depth = 0;
 	u32 stag = 0;
 	u16 access;
-	u64 region_length;
 	bool use_pbles = false;
 	unsigned long flags;
 	int err = -ENOSYS;
 	int ret;
-	int pg_shift;
+
+	if (!udata)
+		return ERR_PTR(-EOPNOTSUPP);
 
 	if (iwdev->closing)
 		return ERR_PTR(-ENODEV);
 
 	if (length > I40IW_MAX_MR_SIZE)
 		return ERR_PTR(-EINVAL);
-	region = ib_umem_get(udata, start, length, acc, 0);
+	region = ib_umem_get(pd->device, start, length, acc);
 	if (IS_ERR(region))
 		return (struct ib_mr *)region;
 
@@ -1779,18 +1772,13 @@
 	if (req.reg_type == IW_MEMREG_TYPE_MEM)
 		iwmr->page_size = ib_umem_find_best_pgsz(region, SZ_4K | SZ_2M,
 							 virt);
-
-	region_length = region->length + (start & (iwmr->page_size - 1));
-	pg_shift = ffs(iwmr->page_size) - 1;
-	pbl_depth = region_length >> pg_shift;
-	pbl_depth += (region_length & (iwmr->page_size - 1)) ? 1 : 0;
 	iwmr->length = region->length;
 
 	iwpbl->user_base = virt;
 	palloc = &iwpbl->pble_alloc;
 
 	iwmr->type = req.reg_type;
-	iwmr->page_cnt = (u32)pbl_depth;
+	iwmr->page_cnt = ib_umem_num_dma_blocks(region, iwmr->page_size);
 
 	switch (req.reg_type) {
 	case IW_MEMREG_TYPE_QP:
@@ -2136,7 +2124,6 @@
 
 		switch (ib_wr->opcode) {
 		case IB_WR_SEND:
-			/* fall-through */
 		case IB_WR_SEND_WITH_INV:
 			if (ib_wr->opcode == IB_WR_SEND) {
 				if (ib_wr->send_flags & IB_SEND_SOLICITED)
@@ -2193,7 +2180,7 @@
 			break;
 		case IB_WR_RDMA_READ_WITH_INV:
 			inv_stag = true;
-			/* fall-through*/
+			fallthrough;
 		case IB_WR_RDMA_READ:
 			if (ib_wr->num_sge > I40IW_MAX_SGE_RD) {
 				err = -EINVAL;
@@ -2450,7 +2437,6 @@
 	if (err)
 		return err;
 
-	immutable->pkey_tbl_len = attr.pkey_tbl_len;
 	immutable->gid_tbl_len = attr.gid_tbl_len;
 
 	return 0;
@@ -2524,10 +2510,11 @@
 
 static void i40iw_get_dev_fw_str(struct ib_device *dev, char *str)
 {
-	u32 firmware_version = I40IW_FW_VERSION;
+	struct i40iw_device *iwdev = to_iwdev(dev);
 
-	snprintf(str, IB_FW_VERSION_NAME_MAX, "%u.%u", firmware_version,
-		 (firmware_version & 0x000000ff));
+	snprintf(str, IB_FW_VERSION_NAME_MAX, "%llu.%llu",
+		 i40iw_fw_major_ver(&iwdev->sc_dev),
+		 i40iw_fw_minor_ver(&iwdev->sc_dev));
 }
 
 /**
@@ -2605,22 +2592,6 @@
 	return 0;
 }
 
-/**
- * i40iw_query_pkey - Query partition key
- * @ibdev: device pointer from stack
- * @port: port number
- * @index: index of pkey
- * @pkey: pointer to store the pkey
- */
-static int i40iw_query_pkey(struct ib_device *ibdev,
-			    u8 port,
-			    u16 index,
-			    u16 *pkey)
-{
-	*pkey = 0;
-	return 0;
-}
-
 static const struct ib_device_ops i40iw_dev_ops = {
 	.owner = THIS_MODULE,
 	.driver_id = RDMA_DRIVER_I40IW,
@@ -2660,7 +2631,6 @@
 	.post_send = i40iw_post_send,
 	.query_device = i40iw_query_device,
 	.query_gid = i40iw_query_gid,
-	.query_pkey = i40iw_query_pkey,
 	.query_port = i40iw_query_port,
 	.query_qp = i40iw_query_qp,
 	.reg_user_mr = i40iw_reg_user_mr,
@@ -2678,7 +2648,7 @@
 {
 	struct i40iw_ib_device *iwibdev;
 	struct net_device *netdev = iwdev->netdev;
-	struct pci_dev *pcidev = (struct pci_dev *)iwdev->hw.dev_context;
+	struct pci_dev *pcidev = iwdev->hw.pcidev;
 
 	iwibdev = ib_alloc_device(i40iw_ib_device, ibdev);
 	if (!iwibdev) {
@@ -2768,7 +2738,8 @@
 	if (ret)
 		goto error;
 
-	ret = ib_register_device(&iwibdev->ibdev, "i40iw%d");
+	dma_set_max_seg_size(&iwdev->hw.pcidev->dev, UINT_MAX);
+	ret = ib_register_device(&iwibdev->ibdev, "i40iw%d", &iwdev->hw.pcidev->dev);
 	if (ret)
 		goto error;
 
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.h b/drivers/infiniband/hw/i40iw/i40iw_verbs.h
index ad7d810..bab71f3 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.h
@@ -89,7 +89,6 @@
 	union {
 		struct ib_mr ibmr;
 		struct ib_mw ibmw;
-		struct ib_fmr ibfmr;
 	};
 	struct ib_umem *region;
 	u16 type;
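The i40iw_reg_user_mr() hunks above switch to the umem DMA-block helpers: ib_umem_num_dma_blocks() sizes the page list and rdma_umem_for_each_dma_block() walks the registered region in aligned blocks of the chosen page size, replacing the hand-rolled pbl_depth arithmetic. A sketch of that pairing; demo_fill_pbl is illustrative and assumes the caller sized pbl with the same helper:

#include <rdma/ib_umem.h>

static void demo_fill_pbl(struct ib_umem *umem, unsigned long pg_size, u64 *pbl)
{
	struct ib_block_iter biter;
	size_t nblocks = ib_umem_num_dma_blocks(umem, pg_size);
	size_t i = 0;

	rdma_umem_for_each_dma_block(umem, &biter, pg_size) {
		if (i >= nblocks)
			break;
		pbl[i++] = rdma_block_iter_dma_address(&biter);
	}
}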
diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig
index cc7c42f..f30ce9d 100644
--- a/drivers/infiniband/hw/mlx4/Kconfig
+++ b/drivers/infiniband/hw/mlx4/Kconfig
@@ -4,7 +4,7 @@
 	depends on NETDEVICES && ETHERNET && PCI && INET
 	select NET_VENDOR_MELLANOX
 	select MLX4_CORE
-	---help---
+	help
 	  This driver provides low-level InfiniBand support for
 	  Mellanox ConnectX PCI Express host channel adapters (HCAs).
 	  This is required to use InfiniBand protocols such as
diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
index 02a169f..7321d6a 100644
--- a/drivers/infiniband/hw/mlx4/ah.c
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -141,10 +141,11 @@
 	return 0;
 }
 
-int mlx4_ib_create_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr,
-		      u32 flags, struct ib_udata *udata)
-
+int mlx4_ib_create_ah(struct ib_ah *ib_ah, struct rdma_ah_init_attr *init_attr,
+		      struct ib_udata *udata)
 {
+	struct rdma_ah_attr *ah_attr = init_attr->ah_attr;
+
 	if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
 		if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH))
 			return -EINVAL;
@@ -167,12 +168,14 @@
 			    int slave_sgid_index, u8 *s_mac, u16 vlan_tag)
 {
 	struct rdma_ah_attr slave_attr = *ah_attr;
+	struct rdma_ah_init_attr init_attr = {};
 	struct mlx4_ib_ah *mah = to_mah(ah);
 	int ret;
 
 	slave_attr.grh.sgid_attr = NULL;
 	slave_attr.grh.sgid_index = slave_sgid_index;
-	ret = mlx4_ib_create_ah(ah, &slave_attr, 0, NULL);
+	init_attr.ah_attr = &slave_attr;
+	ret = mlx4_ib_create_ah(ah, &init_attr, NULL);
 	if (ret)
 		return ret;
 
@@ -229,8 +232,3 @@
 
 	return 0;
 }
-
-void mlx4_ib_destroy_ah(struct ib_ah *ah, u32 flags)
-{
-	return;
-}
diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
index 81d6a34..4aff1c8 100644
--- a/drivers/infiniband/hw/mlx4/cm.c
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -54,11 +54,20 @@
 	struct delayed_work timeout;
 };
 
+struct rej_tmout_entry {
+	int slave;
+	u32 rem_pv_cm_id;
+	struct delayed_work timeout;
+	struct xarray *xa_rej_tmout;
+};
+
 struct cm_generic_msg {
 	struct ib_mad_hdr hdr;
 
 	__be32 local_comm_id;
 	__be32 remote_comm_id;
+	unsigned char unused[2];
+	__be16 rej_reason;
 };
 
 struct cm_sidr_generic_msg {
@@ -288,6 +297,7 @@
 	spin_unlock(&sriov->id_map_lock);
 }
 
+#define REJ_REASON(m) be16_to_cpu(((struct cm_generic_msg *)(m))->rej_reason)
 int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id,
 		struct ib_mad *mad)
 {
@@ -296,8 +306,10 @@
 	int pv_cm_id = -1;
 
 	if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
-			mad->mad_hdr.attr_id == CM_REP_ATTR_ID ||
-			mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+	    mad->mad_hdr.attr_id == CM_REP_ATTR_ID ||
+	    mad->mad_hdr.attr_id == CM_MRA_ATTR_ID ||
+	    mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID ||
+	    (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID && REJ_REASON(mad) == IB_CM_REJ_TIMEOUT)) {
 		sl_cm_id = get_local_comm_id(mad);
 		id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id);
 		if (id)
@@ -317,8 +329,8 @@
 	}
 
 	if (!id) {
-		pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL!\n",
-			 slave_id, sl_cm_id);
+		pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL! attr_id: 0x%x\n",
+			 slave_id, sl_cm_id, be16_to_cpu(mad->mad_hdr.attr_id));
 		return -EINVAL;
 	}
 
@@ -330,11 +342,94 @@
 	return 0;
 }
 
+static void rej_tmout_timeout(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct rej_tmout_entry *item = container_of(delay, struct rej_tmout_entry, timeout);
+	struct rej_tmout_entry *deleted;
+
+	deleted = xa_cmpxchg(item->xa_rej_tmout, item->rem_pv_cm_id, item, NULL, 0);
+
+	if (deleted != item)
+		pr_debug("deleted(%p) != item(%p)\n", deleted, item);
+
+	kfree(item);
+}
+
+static int alloc_rej_tmout(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id, int slave)
+{
+	struct rej_tmout_entry *item;
+	struct rej_tmout_entry *old;
+	int ret = 0;
+
+	xa_lock(&sriov->xa_rej_tmout);
+	item = xa_load(&sriov->xa_rej_tmout, (unsigned long)rem_pv_cm_id);
+
+	if (item) {
+		if (xa_err(item))
+			ret =  xa_err(item);
+		else
+			/* If a retry, adjust delayed work */
+			mod_delayed_work(system_wq, &item->timeout, CM_CLEANUP_CACHE_TIMEOUT);
+		goto err_or_exists;
+	}
+	xa_unlock(&sriov->xa_rej_tmout);
+
+	item = kmalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	INIT_DELAYED_WORK(&item->timeout, rej_tmout_timeout);
+	item->slave = slave;
+	item->rem_pv_cm_id = rem_pv_cm_id;
+	item->xa_rej_tmout = &sriov->xa_rej_tmout;
+
+	old = xa_cmpxchg(&sriov->xa_rej_tmout, (unsigned long)rem_pv_cm_id, NULL, item, GFP_KERNEL);
+	if (old) {
+		pr_debug(
+			"Non-null old entry (%p) or error (%d) when inserting\n",
+			old, xa_err(old));
+		kfree(item);
+		return xa_err(old);
+	}
+
+	schedule_delayed_work(&item->timeout, CM_CLEANUP_CACHE_TIMEOUT);
+
+	return 0;
+
+err_or_exists:
+	xa_unlock(&sriov->xa_rej_tmout);
+	return ret;
+}
+
+static int lookup_rej_tmout_slave(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id)
+{
+	struct rej_tmout_entry *item;
+	int slave;
+
+	xa_lock(&sriov->xa_rej_tmout);
+	item = xa_load(&sriov->xa_rej_tmout, (unsigned long)rem_pv_cm_id);
+
+	if (!item || xa_err(item)) {
+		pr_debug("Could not find slave. rem_pv_cm_id 0x%x error: %d\n",
+			 rem_pv_cm_id, xa_err(item));
+		slave = !item ? -ENOENT : xa_err(item);
+	} else {
+		slave = item->slave;
+	}
+	xa_unlock(&sriov->xa_rej_tmout);
+
+	return slave;
+}
+
 int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
 			     struct ib_mad *mad)
 {
+	struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+	u32 rem_pv_cm_id = get_local_comm_id(mad);
 	u32 pv_cm_id;
 	struct id_map_entry *id;
+	int sts;
 
 	if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
 	    mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
@@ -350,6 +445,13 @@
 				     be64_to_cpu(gid.global.interface_id));
 			return -ENOENT;
 		}
+
+		sts = alloc_rej_tmout(sriov, rem_pv_cm_id, *slave);
+		if (sts)
+			/* Even if this fails, we pass on the REQ to the slave */
+			pr_debug("Could not allocate rej_tmout entry. rem_pv_cm_id 0x%x slave %d status %d\n",
+				 rem_pv_cm_id, *slave, sts);
+
 		return 0;
 	}
 
@@ -357,7 +459,14 @@
 	id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1);
 
 	if (!id) {
-		pr_debug("Couldn't find an entry for pv_cm_id 0x%x\n", pv_cm_id);
+		if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID &&
+		    REJ_REASON(mad) == IB_CM_REJ_TIMEOUT && slave) {
+			*slave = lookup_rej_tmout_slave(sriov, rem_pv_cm_id);
+
+			return (*slave < 0) ? *slave : 0;
+		}
+		pr_debug("Couldn't find an entry for pv_cm_id 0x%x, attr_id 0x%x\n",
+			 pv_cm_id, be16_to_cpu(mad->mad_hdr.attr_id));
 		return -ENOENT;
 	}
 
@@ -378,6 +487,34 @@
 	INIT_LIST_HEAD(&dev->sriov.cm_list);
 	dev->sriov.sl_id_map = RB_ROOT;
 	xa_init_flags(&dev->sriov.pv_id_table, XA_FLAGS_ALLOC);
+	xa_init(&dev->sriov.xa_rej_tmout);
+}
+
+static void rej_tmout_xa_cleanup(struct mlx4_ib_sriov *sriov, int slave)
+{
+	struct rej_tmout_entry *item;
+	bool flush_needed = false;
+	unsigned long id;
+	int cnt = 0;
+
+	xa_lock(&sriov->xa_rej_tmout);
+	xa_for_each(&sriov->xa_rej_tmout, id, item) {
+		if (slave < 0 || slave == item->slave) {
+			mod_delayed_work(system_wq, &item->timeout, 0);
+			flush_needed = true;
+			++cnt;
+		}
+	}
+	xa_unlock(&sriov->xa_rej_tmout);
+
+	if (flush_needed) {
+		flush_scheduled_work();
+		pr_debug("Deleted %d entries in xarray for slave %d during cleanup\n",
+			 cnt, slave);
+	}
+
+	if (slave < 0)
+		WARN_ON(!xa_empty(&sriov->xa_rej_tmout));
 }
 
 /* slave = -1 ==> all slaves */
@@ -447,4 +584,6 @@
 		list_del(&map->list);
 		kfree(map);
 	}
+
+	rej_tmout_xa_cleanup(sriov, slave);
 }
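
Note: the cm.c hunks above add a REJ-timeout cache. Entries keyed by the remote pv_cm_id are stored in sriov->xa_rej_tmout together with the owning slave, refreshed when a retry arrives, and aged out by a delayed work item, so a later REJ with reason IB_CM_REJ_TIMEOUT can still be demultiplexed after the id_map entry is gone. A minimal sketch of the insert-or-refresh pattern used there (hypothetical names, error paths trimmed, timeout handler stubbed out):

	#include <linux/types.h>
	#include <linux/xarray.h>
	#include <linux/workqueue.h>
	#include <linux/slab.h>

	struct tmout_entry {
		int slave;
		struct delayed_work timeout;
	};

	static void tmout_fn(struct work_struct *work)
	{
		/* the real handler erases the xarray entry and frees it */
	}

	static int track_entry(struct xarray *xa, u32 key, int slave,
			       unsigned long delay)
	{
		struct tmout_entry *e, *old;

		xa_lock(xa);
		e = xa_load(xa, key);
		if (e && !xa_err(e))
			/* already tracked: just push the timeout out */
			mod_delayed_work(system_wq, &e->timeout, delay);
		xa_unlock(xa);
		if (e)
			return xa_err(e);	/* 0 when refreshed */

		e = kmalloc(sizeof(*e), GFP_KERNEL);
		if (!e)
			return -ENOMEM;
		e->slave = slave;
		INIT_DELAYED_WORK(&e->timeout, tmout_fn);

		/* detect a concurrent insert instead of re-taking the lock */
		old = xa_cmpxchg(xa, key, NULL, e, GFP_KERNEL);
		if (old) {
			kfree(e);
			return xa_err(old);
		}
		schedule_delayed_work(&e->timeout, delay);
		return 0;
	}
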
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index a7d238d..e9b5a4d 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -144,12 +144,11 @@
 	int shift;
 	int n;
 
-	*umem = ib_umem_get(udata, buf_addr, cqe * cqe_size,
-			    IB_ACCESS_LOCAL_WRITE, 1);
+	*umem = ib_umem_get(&dev->ib_dev, buf_addr, cqe * cqe_size,
+			    IB_ACCESS_LOCAL_WRITE);
 	if (IS_ERR(*umem))
 		return PTR_ERR(*umem);
 
-	n = ib_umem_page_count(*umem);
 	shift = mlx4_ib_umem_calc_optimal_mtt_size(*umem, 0, &n);
 	err = mlx4_mtt_init(dev->dev, n, shift, &buf->mtt);
 
@@ -475,7 +474,7 @@
 	return err;
 }
 
-void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(cq->device);
 	struct mlx4_ib_cq *mcq = to_mcq(cq);
@@ -495,6 +494,7 @@
 		mlx4_db_free(dev->dev, &mcq->db);
 	}
 	ib_umem_release(mcq->umem);
+	return 0;
 }
 
 static void dump_cqe(void *cqe)
@@ -568,18 +568,13 @@
 	wc->vendor_err = cqe->vendor_err_syndrome;
 }
 
-static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum)
+static int mlx4_ib_ipoib_csum_ok(__be16 status, u8 badfcs_enc, __be16 checksum)
 {
-	return ((status & cpu_to_be16(MLX4_CQE_STATUS_IPV4      |
-				      MLX4_CQE_STATUS_IPV4F     |
-				      MLX4_CQE_STATUS_IPV4OPT   |
-				      MLX4_CQE_STATUS_IPV6      |
-				      MLX4_CQE_STATUS_IPOK)) ==
-		cpu_to_be16(MLX4_CQE_STATUS_IPV4        |
-			    MLX4_CQE_STATUS_IPOK))              &&
-		(status & cpu_to_be16(MLX4_CQE_STATUS_UDP       |
-				      MLX4_CQE_STATUS_TCP))     &&
-		checksum == cpu_to_be16(0xffff);
+	return ((badfcs_enc & MLX4_CQE_STATUS_L4_CSUM) ||
+		((status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
+		 (status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
+				       MLX4_CQE_STATUS_UDP)) &&
+		 (checksum == cpu_to_be16(0xffff))));
 }
 
 static void use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc,
@@ -770,13 +765,13 @@
 		switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
 		case MLX4_OPCODE_RDMA_WRITE_IMM:
 			wc->wc_flags |= IB_WC_WITH_IMM;
-			/* fall through */
+			fallthrough;
 		case MLX4_OPCODE_RDMA_WRITE:
 			wc->opcode    = IB_WC_RDMA_WRITE;
 			break;
 		case MLX4_OPCODE_SEND_IMM:
 			wc->wc_flags |= IB_WC_WITH_IMM;
-			/* fall through */
+			fallthrough;
 		case MLX4_OPCODE_SEND:
 		case MLX4_OPCODE_SEND_INVAL:
 			wc->opcode    = IB_WC_SEND;
@@ -855,6 +850,7 @@
 		wc->wc_flags	  |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0;
 		wc->pkey_index     = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f;
 		wc->wc_flags	  |= mlx4_ib_ipoib_csum_ok(cqe->status,
+					cqe->badfcs_enc,
 					cqe->checksum) ? IB_WC_IP_CSUM_OK : 0;
 		if (is_eth) {
 			wc->slid = 0;
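
Note: besides teaching mlx4_ib_ipoib_csum_ok() about the MLX4_CQE_STATUS_L4_CSUM bit in badfcs_enc, the cq.c hunks show two core API updates that recur in doorbell.c, mr.c and qp.c below: ib_umem_get() now takes the ib_device (and no longer has a dmasync flag), and ib_umem_page_count() is gone, the block count being derived while picking the MTT shift or via ib_umem_num_dma_blocks(). Roughly, with ibdev/udata/addr/size/access/dmasync as placeholders:

	/* before (as removed above) */
	umem = ib_umem_get(udata, addr, size, access, dmasync);
	n = ib_umem_page_count(umem);

	/* after */
	umem = ib_umem_get(ibdev, addr, size, access);
	n = ib_umem_num_dma_blocks(umem, PAGE_SIZE);
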
diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c
index 0f39035..d41f03c 100644
--- a/drivers/infiniband/hw/mlx4/doorbell.c
+++ b/drivers/infiniband/hw/mlx4/doorbell.c
@@ -64,7 +64,8 @@
 
 	page->user_virt = (virt & PAGE_MASK);
 	page->refcnt    = 0;
-	page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0, 0);
+	page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK,
+				 PAGE_SIZE, 0);
 	if (IS_ERR(page->umem)) {
 		err = PTR_ERR(page->umem);
 		kfree(page);
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 08eccf2..8bd1647 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -500,6 +500,13 @@
 					 sgid, dgid);
 }
 
+static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave)
+{
+	int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave;
+
+	return (qpn >= proxy_start && qpn <= proxy_start + 1);
+}
+
 int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
 			  enum ib_qp_type dest_qpt, struct ib_wc *wc,
 			  struct ib_grh *grh, struct ib_mad *mad)
@@ -520,8 +527,10 @@
 	u16 cached_pkey;
 	u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
 
-	if (dest_qpt > IB_QPT_GSI)
+	if (dest_qpt > IB_QPT_GSI) {
+		pr_debug("dest_qpt (%d) > IB_QPT_GSI\n", dest_qpt);
 		return -EINVAL;
+	}
 
 	tun_ctx = dev->sriov.demux[port-1].tun[slave];
 
@@ -538,12 +547,20 @@
 	if (dest_qpt) {
 		u16 pkey_ix;
 		ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey);
-		if (ret)
+		if (ret) {
+			pr_debug("unable to get %s cached pkey for index %d, ret %d\n",
+				 is_proxy_qp0(dev, wc->src_qp, slave) ? "SMI" : "GSI",
+				 wc->pkey_index, ret);
 			return -EINVAL;
+		}
 
 		ret = find_slave_port_pkey_ix(dev, slave, port, cached_pkey, &pkey_ix);
-		if (ret)
+		if (ret) {
+			pr_debug("unable to get %s pkey ix for pkey 0x%x, ret %d\n",
+				 is_proxy_qp0(dev, wc->src_qp, slave) ? "SMI" : "GSI",
+				 cached_pkey, ret);
 			return -EINVAL;
+		}
 		tun_pkey_ix = pkey_ix;
 	} else
 		tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0];
@@ -715,7 +732,8 @@
 
 		err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad);
 		if (err)
-			pr_debug("failed sending to slave %d via tunnel qp (%d)\n",
+			pr_debug("failed sending %s to slave %d via tunnel qp (%d)\n",
+				 is_proxy_qp0(dev, wc->src_qp, slave) ? "SMI" : "GSI",
 				 slave, err);
 		return 0;
 	}
@@ -794,7 +812,8 @@
 
 	err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad);
 	if (err)
-		pr_debug("failed sending to slave %d via tunnel qp (%d)\n",
+		pr_debug("failed sending %s to slave %d via tunnel qp (%d)\n",
+			 is_proxy_qp0(dev, wc->src_qp, slave) ? "SMI" : "GSI",
 			 slave, err);
 	return 0;
 }
@@ -807,27 +826,6 @@
 	int err;
 	struct ib_port_attr pattr;
 
-	if (in_wc && in_wc->qp) {
-		pr_debug("received MAD: port:%d slid:%d sqpn:%d "
-			 "dlid_bits:%d dqpn:%d wc_flags:0x%x tid:%016llx cls:%x mtd:%x atr:%x\n",
-			 port_num,
-			 in_wc->slid, in_wc->src_qp,
-			 in_wc->dlid_path_bits,
-			 in_wc->qp->qp_num,
-			 in_wc->wc_flags,
-			 be64_to_cpu(in_mad->mad_hdr.tid),
-			 in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method,
-			 be16_to_cpu(in_mad->mad_hdr.attr_id));
-		if (in_wc->wc_flags & IB_WC_GRH) {
-			pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n",
-				 be64_to_cpu(in_grh->sgid.global.subnet_prefix),
-				 be64_to_cpu(in_grh->sgid.global.interface_id));
-			pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n",
-				 be64_to_cpu(in_grh->dgid.global.subnet_prefix),
-				 be64_to_cpu(in_grh->dgid.global.interface_id));
-		}
-	}
-
 	slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE);
 
 	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) {
@@ -966,7 +964,6 @@
 	}
 	mutex_unlock(&dev->counters_table[port_num - 1].mutex);
 	if (stats_avail) {
-		memset(out_mad->data, 0, sizeof out_mad->data);
 		switch (counter_stats.counter_mode & 0xf) {
 		case 0:
 			edit_counter(&counter_stats,
@@ -984,38 +981,31 @@
 
 int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 			const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-			const struct ib_mad_hdr *in, size_t in_mad_size,
-			struct ib_mad_hdr *out, size_t *out_mad_size,
-			u16 *out_mad_pkey_index)
+			const struct ib_mad *in, struct ib_mad *out,
+			size_t *out_mad_size, u16 *out_mad_pkey_index)
 {
 	struct mlx4_ib_dev *dev = to_mdev(ibdev);
-	const struct ib_mad *in_mad = (const struct ib_mad *)in;
-	struct ib_mad *out_mad = (struct ib_mad *)out;
 	enum rdma_link_layer link = rdma_port_get_link_layer(ibdev, port_num);
 
-	if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
-			 *out_mad_size != sizeof(*out_mad)))
-		return IB_MAD_RESULT_FAILURE;
-
 	/* iboe_process_mad() which uses the HCA flow-counters to implement IB PMA
 	 * queries, should be called only by VFs and for that specific purpose
 	 */
 	if (link == IB_LINK_LAYER_INFINIBAND) {
 		if (mlx4_is_slave(dev->dev) &&
-		    (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT &&
-		     (in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS ||
-		      in_mad->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT ||
-		      in_mad->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO)))
-			return iboe_process_mad(ibdev, mad_flags, port_num, in_wc,
-						in_grh, in_mad, out_mad);
+		    (in->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT &&
+		     (in->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS ||
+		      in->mad_hdr.attr_id == IB_PMA_PORT_COUNTERS_EXT ||
+		      in->mad_hdr.attr_id == IB_PMA_CLASS_PORT_INFO)))
+			return iboe_process_mad(ibdev, mad_flags, port_num,
+						in_wc, in_grh, in, out);
 
-		return ib_process_mad(ibdev, mad_flags, port_num, in_wc,
-				      in_grh, in_mad, out_mad);
+		return ib_process_mad(ibdev, mad_flags, port_num, in_wc, in_grh,
+				      in, out);
 	}
 
 	if (link == IB_LINK_LAYER_ETHERNET)
 		return iboe_process_mad(ibdev, mad_flags, port_num, in_wc,
-					in_grh, in_mad, out_mad);
+					in_grh, in, out);
 
 	return -EINVAL;
 }
@@ -1361,14 +1351,6 @@
 	return ret;
 }
 
-static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave)
-{
-	int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave;
-
-	return (qpn >= proxy_start && qpn <= proxy_start + 1);
-}
-
-
 int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
 			 enum ib_qp_type dest_qpt, u16 pkey_index,
 			 u32 remote_qpn, u32 qkey, struct rdma_ah_attr *attr,
@@ -1421,10 +1403,10 @@
 
 	spin_lock(&sqp->tx_lock);
 	if (sqp->tx_ix_head - sqp->tx_ix_tail >=
-	    (MLX4_NUM_TUNNEL_BUFS - 1))
+	    (MLX4_NUM_WIRE_BUFS - 1))
 		ret = -EAGAIN;
 	else
-		wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1);
+		wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_WIRE_BUFS - 1);
 	spin_unlock(&sqp->tx_lock);
 	if (ret)
 		goto out;
@@ -1504,6 +1486,7 @@
 	u16 vlan_id;
 	u8 qos;
 	u8 *dmac;
+	int sts;
 
 	/* Get slave that sent this packet */
 	if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn ||
@@ -1600,13 +1583,17 @@
 					&vlan_id, &qos))
 		rdma_ah_set_sl(&ah_attr, qos);
 
-	mlx4_ib_send_to_wire(dev, slave, ctx->port,
-			     is_proxy_qp0(dev, wc->src_qp, slave) ?
-			     IB_QPT_SMI : IB_QPT_GSI,
-			     be16_to_cpu(tunnel->hdr.pkey_index),
-			     be32_to_cpu(tunnel->hdr.remote_qpn),
-			     be32_to_cpu(tunnel->hdr.qkey),
-			     &ah_attr, wc->smac, vlan_id, &tunnel->mad);
+	sts = mlx4_ib_send_to_wire(dev, slave, ctx->port,
+				   is_proxy_qp0(dev, wc->src_qp, slave) ?
+				   IB_QPT_SMI : IB_QPT_GSI,
+				   be16_to_cpu(tunnel->hdr.pkey_index),
+				   be32_to_cpu(tunnel->hdr.remote_qpn),
+				   be32_to_cpu(tunnel->hdr.qkey),
+				   &ah_attr, wc->smac, vlan_id, &tunnel->mad);
+	if (sts)
+		pr_debug("failed sending %s to wire on behalf of slave %d (%d)\n",
+			 is_proxy_qp0(dev, wc->src_qp, slave) ? "SMI" : "GSI",
+			 slave, sts);
 }
 
 static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
@@ -1615,19 +1602,20 @@
 	int i;
 	struct mlx4_ib_demux_pv_qp *tun_qp;
 	int rx_buf_size, tx_buf_size;
+	const int nmbr_bufs = is_tun ? MLX4_NUM_TUNNEL_BUFS : MLX4_NUM_WIRE_BUFS;
 
 	if (qp_type > IB_QPT_GSI)
 		return -EINVAL;
 
 	tun_qp = &ctx->qp[qp_type];
 
-	tun_qp->ring = kcalloc(MLX4_NUM_TUNNEL_BUFS,
+	tun_qp->ring = kcalloc(nmbr_bufs,
 			       sizeof(struct mlx4_ib_buf),
 			       GFP_KERNEL);
 	if (!tun_qp->ring)
 		return -ENOMEM;
 
-	tun_qp->tx_ring = kcalloc(MLX4_NUM_TUNNEL_BUFS,
+	tun_qp->tx_ring = kcalloc(nmbr_bufs,
 				  sizeof (struct mlx4_ib_tun_tx_buf),
 				  GFP_KERNEL);
 	if (!tun_qp->tx_ring) {
@@ -1644,7 +1632,7 @@
 		tx_buf_size = sizeof (struct mlx4_mad_snd_buf);
 	}
 
-	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+	for (i = 0; i < nmbr_bufs; i++) {
 		tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL);
 		if (!tun_qp->ring[i].addr)
 			goto err;
@@ -1658,7 +1646,7 @@
 		}
 	}
 
-	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+	for (i = 0; i < nmbr_bufs; i++) {
 		tun_qp->tx_ring[i].buf.addr =
 			kmalloc(tx_buf_size, GFP_KERNEL);
 		if (!tun_qp->tx_ring[i].buf.addr)
@@ -1689,7 +1677,7 @@
 				    tx_buf_size, DMA_TO_DEVICE);
 		kfree(tun_qp->tx_ring[i].buf.addr);
 	}
-	i = MLX4_NUM_TUNNEL_BUFS;
+	i = nmbr_bufs;
 err:
 	while (i > 0) {
 		--i;
@@ -1710,6 +1698,7 @@
 	int i;
 	struct mlx4_ib_demux_pv_qp *tun_qp;
 	int rx_buf_size, tx_buf_size;
+	const int nmbr_bufs = is_tun ? MLX4_NUM_TUNNEL_BUFS : MLX4_NUM_WIRE_BUFS;
 
 	if (qp_type > IB_QPT_GSI)
 		return;
@@ -1724,13 +1713,13 @@
 	}
 
 
-	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+	for (i = 0; i < nmbr_bufs; i++) {
 		ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map,
 				    rx_buf_size, DMA_FROM_DEVICE);
 		kfree(tun_qp->ring[i].addr);
 	}
 
-	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+	for (i = 0; i < nmbr_bufs; i++) {
 		ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map,
 				    tx_buf_size, DMA_TO_DEVICE);
 		kfree(tun_qp->tx_ring[i].buf.addr);
@@ -1764,9 +1753,6 @@
 					       "buf:%lld\n", wc.wr_id);
 				break;
 			case IB_WC_SEND:
-				pr_debug("received tunnel send completion:"
-					 "wrid=0x%llx, status=0x%x\n",
-					 wc.wr_id, wc.status);
 				rdma_destroy_ah(tun_qp->tx_ring[wc.wr_id &
 					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah, 0);
 				tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
@@ -1813,6 +1799,7 @@
 	struct mlx4_ib_qp_tunnel_init_attr qp_init_attr;
 	struct ib_qp_attr attr;
 	int qp_attr_mask_INIT;
+	const int nmbr_bufs = create_tun ? MLX4_NUM_TUNNEL_BUFS : MLX4_NUM_WIRE_BUFS;
 
 	if (qp_type > IB_QPT_GSI)
 		return -EINVAL;
@@ -1823,8 +1810,8 @@
 	qp_init_attr.init_attr.send_cq = ctx->cq;
 	qp_init_attr.init_attr.recv_cq = ctx->cq;
 	qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
-	qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS;
-	qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS;
+	qp_init_attr.init_attr.cap.max_send_wr = nmbr_bufs;
+	qp_init_attr.init_attr.cap.max_recv_wr = nmbr_bufs;
 	qp_init_attr.init_attr.cap.max_send_sge = 1;
 	qp_init_attr.init_attr.cap.max_recv_sge = 1;
 	if (create_tun) {
@@ -1886,7 +1873,7 @@
 		goto err_qp;
 	}
 
-	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+	for (i = 0; i < nmbr_bufs; i++) {
 		ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i);
 		if (ret) {
 			pr_err(" mlx4_ib_post_pv_buf error"
@@ -1922,8 +1909,8 @@
 			switch (wc.opcode) {
 			case IB_WC_SEND:
 				kfree(sqp->tx_ring[wc.wr_id &
-				      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
-				sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+				      (MLX4_NUM_WIRE_BUFS - 1)].ah);
+				sqp->tx_ring[wc.wr_id & (MLX4_NUM_WIRE_BUFS - 1)].ah
 					= NULL;
 				spin_lock(&sqp->tx_lock);
 				sqp->tx_ix_tail++;
@@ -1932,13 +1919,13 @@
 			case IB_WC_RECV:
 				mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *)
 						(sqp->ring[wc.wr_id &
-						(MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload);
+						(MLX4_NUM_WIRE_BUFS - 1)].addr))->payload);
 				grh = &(((struct mlx4_mad_rcv_buf *)
 						(sqp->ring[wc.wr_id &
-						(MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh);
+						(MLX4_NUM_WIRE_BUFS - 1)].addr))->grh);
 				mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad);
 				if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id &
-							   (MLX4_NUM_TUNNEL_BUFS - 1)))
+							   (MLX4_NUM_WIRE_BUFS - 1)))
 					pr_err("Failed reposting SQP "
 					       "buf:%lld\n", wc.wr_id);
 				break;
@@ -1951,8 +1938,8 @@
 				 ctx->slave, wc.status, wc.wr_id);
 			if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
 				kfree(sqp->tx_ring[wc.wr_id &
-				      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
-				sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+				      (MLX4_NUM_WIRE_BUFS - 1)].ah);
+				sqp->tx_ring[wc.wr_id & (MLX4_NUM_WIRE_BUFS - 1)].ah
 					= NULL;
 				spin_lock(&sqp->tx_lock);
 				sqp->tx_ix_tail++;
@@ -1992,6 +1979,7 @@
 {
 	int ret, cq_size;
 	struct ib_cq_init_attr cq_attr = {};
+	const int nmbr_bufs = create_tun ? MLX4_NUM_TUNNEL_BUFS : MLX4_NUM_WIRE_BUFS;
 
 	if (ctx->state != DEMUX_PV_STATE_DOWN)
 		return -EEXIST;
@@ -2016,7 +2004,7 @@
 		goto err_out_qp0;
 	}
 
-	cq_size = 2 * MLX4_NUM_TUNNEL_BUFS;
+	cq_size = 2 * nmbr_bufs;
 	if (ctx->has_smi)
 		cq_size *= 2;
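
Note: mad.c now sizes its MAD buffer rings by QP role: tunnel QPs keep MLX4_NUM_TUNNEL_BUFS while the wire (proxy SQP) side uses the larger MLX4_NUM_WIRE_BUFS, with nmbr_bufs chosen consistently at buffer allocation, buffer free, QP creation and CQ sizing. Completions locate their ring slot by masking wr_id, which quietly depends on both constants being powers of two (512 and 2048 per the mlx4_ib.h hunk below); an illustrative guard, not part of the patch:

	/* needs <linux/build_bug.h> */
	const int nmbr_bufs = is_tun ? MLX4_NUM_TUNNEL_BUFS : MLX4_NUM_WIRE_BUFS;
	unsigned int slot = wc.wr_id & (nmbr_bufs - 1);	/* ring index */

	static_assert(!(MLX4_NUM_TUNNEL_BUFS & (MLX4_NUM_TUNNEL_BUFS - 1)));
	static_assert(!(MLX4_NUM_WIRE_BUFS & (MLX4_NUM_WIRE_BUFS - 1)));
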
 
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index 4886426..05c7200 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -263,6 +263,8 @@
 	int hw_update = 0;
 	int i;
 	struct gid_entry *gids = NULL;
+	u16 vlan_id = 0xffff;
+	u8 mac[ETH_ALEN];
 
 	if (!rdma_cap_roce_gid_table(attr->device, attr->port_num))
 		return -EINVAL;
@@ -273,12 +275,16 @@
 	if (!context)
 		return -EINVAL;
 
+	ret = rdma_read_gid_l2_fields(attr, &vlan_id, &mac[0]);
+	if (ret)
+		return ret;
 	port_gid_table = &iboe->gids[attr->port_num - 1];
 	spin_lock_bh(&iboe->lock);
 	for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) {
 		if (!memcmp(&port_gid_table->gids[i].gid,
 			    &attr->gid, sizeof(attr->gid)) &&
-		    port_gid_table->gids[i].gid_type == attr->gid_type)  {
+		    port_gid_table->gids[i].gid_type == attr->gid_type &&
+		    port_gid_table->gids[i].vlan_id == vlan_id)  {
 			found = i;
 			break;
 		}
@@ -298,6 +304,7 @@
 				memcpy(&port_gid_table->gids[free].gid,
 				       &attr->gid, sizeof(attr->gid));
 				port_gid_table->gids[free].gid_type = attr->gid_type;
+				port_gid_table->gids[free].vlan_id = vlan_id;
 				port_gid_table->gids[free].ctx->real_index = free;
 				port_gid_table->gids[free].ctx->refcount = 1;
 				hw_update = 1;
@@ -427,9 +434,6 @@
 	return real_index;
 }
 
-#define field_avail(type, fld, sz) (offsetof(type, fld) + \
-				    sizeof(((type *)0)->fld) <= (sz))
-
 static int mlx4_ib_query_device(struct ib_device *ibdev,
 				struct ib_device_attr *props,
 				struct ib_udata *uhw)
@@ -440,7 +444,7 @@
 	int err;
 	int have_ib_ports;
 	struct mlx4_uverbs_ex_query_device cmd;
-	struct mlx4_uverbs_ex_query_device_resp resp = {.comp_mask = 0};
+	struct mlx4_uverbs_ex_query_device_resp resp = {};
 	struct mlx4_clock_params clock_params;
 
 	if (uhw->inlen) {
@@ -554,7 +558,6 @@
 	props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm;
 	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
 					   props->max_mcast_grp;
-	props->max_map_per_fmr = dev->dev->caps.max_fmr_maps;
 	props->hca_core_clock = dev->dev->caps.hca_core_clock * 1000UL;
 	props->timestamp_mask = 0xFFFFFFFFFFFFULL;
 	props->max_ah = INT_MAX;
@@ -592,7 +595,7 @@
 			sizeof(struct mlx4_wqe_data_seg);
 	}
 
-	if (field_avail(typeof(resp), rss_caps, uhw->outlen)) {
+	if (offsetofend(typeof(resp), rss_caps) <= uhw->outlen) {
 		if (props->rss_caps.supported_qpts) {
 			resp.rss_caps.rx_hash_function =
 				MLX4_IB_RX_HASH_FUNC_TOEPLITZ;
@@ -616,7 +619,7 @@
 				       sizeof(resp.rss_caps);
 	}
 
-	if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
+	if (offsetofend(typeof(resp), tso_caps) <= uhw->outlen) {
 		if (dev->dev->caps.max_gso_sz &&
 		    ((mlx4_ib_port_link_layer(ibdev, 1) ==
 		    IB_LINK_LAYER_ETHERNET) ||
@@ -1156,7 +1159,8 @@
 		return rdma_user_mmap_io(context, vma,
 					 to_mucontext(context)->uar.pfn,
 					 PAGE_SIZE,
-					 pgprot_noncached(vma->vm_page_prot));
+					 pgprot_noncached(vma->vm_page_prot),
+					 NULL);
 
 	case 1:
 		if (dev->dev->caps.bf_reg_size == 0)
@@ -1165,7 +1169,8 @@
 			context, vma,
 			to_mucontext(context)->uar.pfn +
 				dev->dev->caps.num_uars,
-			PAGE_SIZE, pgprot_writecombine(vma->vm_page_prot));
+			PAGE_SIZE, pgprot_writecombine(vma->vm_page_prot),
+			NULL);
 
 	case 3: {
 		struct mlx4_clock_params params;
@@ -1181,7 +1186,8 @@
 					    params.bar) +
 			 params.offset) >>
 				PAGE_SHIFT,
-			PAGE_SIZE, pgprot_noncached(vma->vm_page_prot));
+			PAGE_SIZE, pgprot_noncached(vma->vm_page_prot),
+			NULL);
 	}
 
 	default:
@@ -1206,51 +1212,46 @@
 	return 0;
 }
 
-static void mlx4_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+static int mlx4_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
 	mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn);
+	return 0;
 }
 
-static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev,
-					  struct ib_udata *udata)
+static int mlx4_ib_alloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata)
 {
-	struct mlx4_ib_xrcd *xrcd;
+	struct mlx4_ib_dev *dev = to_mdev(ibxrcd->device);
+	struct mlx4_ib_xrcd *xrcd = to_mxrcd(ibxrcd);
 	struct ib_cq_init_attr cq_attr = {};
 	int err;
 
-	if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
-		return ERR_PTR(-ENOSYS);
+	if (!(dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+		return -EOPNOTSUPP;
 
-	xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL);
-	if (!xrcd)
-		return ERR_PTR(-ENOMEM);
-
-	err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn);
+	err = mlx4_xrcd_alloc(dev->dev, &xrcd->xrcdn);
 	if (err)
-		goto err1;
+		return err;
 
-	xrcd->pd = ib_alloc_pd(ibdev, 0);
+	xrcd->pd = ib_alloc_pd(ibxrcd->device, 0);
 	if (IS_ERR(xrcd->pd)) {
 		err = PTR_ERR(xrcd->pd);
 		goto err2;
 	}
 
 	cq_attr.cqe = 1;
-	xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, &cq_attr);
+	xrcd->cq = ib_create_cq(ibxrcd->device, NULL, NULL, xrcd, &cq_attr);
 	if (IS_ERR(xrcd->cq)) {
 		err = PTR_ERR(xrcd->cq);
 		goto err3;
 	}
 
-	return &xrcd->ibxrcd;
+	return 0;
 
 err3:
 	ib_dealloc_pd(xrcd->pd);
 err2:
-	mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn);
-err1:
-	kfree(xrcd);
-	return ERR_PTR(err);
+	mlx4_xrcd_free(dev->dev, xrcd->xrcdn);
+	return err;
 }
 
 static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
@@ -1258,8 +1259,6 @@
 	ib_destroy_cq(to_mxrcd(xrcd)->cq);
 	ib_dealloc_pd(to_mxrcd(xrcd)->pd);
 	mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn);
-	kfree(xrcd);
-
 	return 0;
 }
 
@@ -1533,23 +1532,11 @@
 	struct mlx4_net_trans_rule_hw_ctrl *ctrl;
 	int default_flow;
 
-	static const u16 __mlx4_domain[] = {
-		[IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS,
-		[IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL,
-		[IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS,
-		[IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC,
-	};
-
 	if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) {
 		pr_err("Invalid priority value %d\n", flow_attr->priority);
 		return -EINVAL;
 	}
 
-	if (domain >= IB_FLOW_DOMAIN_NUM) {
-		pr_err("Invalid domain value %d\n", domain);
-		return -EINVAL;
-	}
-
 	if (mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0)
 		return -EINVAL;
 
@@ -1558,8 +1545,7 @@
 		return PTR_ERR(mailbox);
 	ctrl = mailbox->buf;
 
-	ctrl->prio = cpu_to_be16(__mlx4_domain[domain] |
-				 flow_attr->priority);
+	ctrl->prio = cpu_to_be16(domain | flow_attr->priority);
 	ctrl->type = mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type);
 	ctrl->port = flow_attr->port;
 	ctrl->qpn = cpu_to_be32(qp->qp_num);
@@ -1701,8 +1687,8 @@
 }
 
 static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp,
-				    struct ib_flow_attr *flow_attr,
-				    int domain, struct ib_udata *udata)
+					   struct ib_flow_attr *flow_attr,
+					   struct ib_udata *udata)
 {
 	int err = 0, i = 0, j = 0;
 	struct mlx4_ib_flow *mflow;
@@ -1768,8 +1754,8 @@
 	}
 
 	while (i < ARRAY_SIZE(type) && type[i]) {
-		err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i],
-					    &mflow->reg_id[i].id);
+		err = __mlx4_ib_create_flow(qp, flow_attr, MLX4_DOMAIN_UVERBS,
+					    type[i], &mflow->reg_id[i].id);
 		if (err)
 			goto err_create_flow;
 		if (is_bonded) {
@@ -1778,7 +1764,7 @@
 			 */
 			flow_attr->port = 2;
 			err = __mlx4_ib_create_flow(qp, flow_attr,
-						    domain, type[j],
+						    MLX4_DOMAIN_UVERBS, type[j],
 						    &mflow->reg_id[j].mirror);
 			flow_attr->port = 1;
 			if (err)
@@ -2589,23 +2575,23 @@
 	.destroy_rwq_ind_table = mlx4_ib_destroy_rwq_ind_table,
 	.destroy_wq = mlx4_ib_destroy_wq,
 	.modify_wq = mlx4_ib_modify_wq,
-};
 
-static const struct ib_device_ops mlx4_ib_dev_fmr_ops = {
-	.alloc_fmr = mlx4_ib_fmr_alloc,
-	.dealloc_fmr = mlx4_ib_fmr_dealloc,
-	.map_phys_fmr = mlx4_ib_map_phys_fmr,
-	.unmap_fmr = mlx4_ib_unmap_fmr,
+	INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx4_ib_rwq_ind_table,
+			   ib_rwq_ind_tbl),
 };
 
 static const struct ib_device_ops mlx4_ib_dev_mw_ops = {
 	.alloc_mw = mlx4_ib_alloc_mw,
 	.dealloc_mw = mlx4_ib_dealloc_mw,
+
+	INIT_RDMA_OBJ_SIZE(ib_mw, mlx4_ib_mw, ibmw),
 };
 
 static const struct ib_device_ops mlx4_ib_dev_xrc_ops = {
 	.alloc_xrcd = mlx4_ib_alloc_xrcd,
 	.dealloc_xrcd = mlx4_ib_dealloc_xrcd,
+
+	INIT_RDMA_OBJ_SIZE(ib_xrcd, mlx4_ib_xrcd, ibxrcd),
 };
 
 static const struct ib_device_ops mlx4_ib_dev_fs_ops = {
@@ -2715,9 +2701,6 @@
 		ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_wq_ops);
 	}
 
-	if (!mlx4_is_slave(ibdev->dev))
-		ib_set_device_ops(&ibdev->ib_dev, &mlx4_ib_dev_fmr_ops);
-
 	if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW ||
 	    dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) {
 		ibdev->ib_dev.uverbs_cmd_mask |=
@@ -2855,7 +2838,8 @@
 		goto err_steer_free_bitmap;
 
 	rdma_set_device_sysfs_group(&ibdev->ib_dev, &mlx4_attr_group);
-	if (ib_register_device(&ibdev->ib_dev, "mlx4_%d"))
+	if (ib_register_device(&ibdev->ib_dev, "mlx4_%d",
+			       &dev->persist->pdev->dev))
 		goto err_diag_counters;
 
 	if (mlx4_ib_mad_init(ibdev))
@@ -2997,10 +2981,8 @@
 		/* Add an empty rule for IB L2 */
 		memset(&ib_spec->mask, 0, sizeof(ib_spec->mask));
 
-		err = __mlx4_ib_create_flow(&mqp->ibqp, flow,
-					    IB_FLOW_DOMAIN_NIC,
-					    MLX4_FS_REGULAR,
-					    &mqp->reg_id);
+		err = __mlx4_ib_create_flow(&mqp->ibqp, flow, MLX4_DOMAIN_NIC,
+					    MLX4_FS_REGULAR, &mqp->reg_id);
 	} else {
 		err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id);
 	}
@@ -3291,7 +3273,7 @@
 	case MLX4_DEV_EVENT_PORT_MGMT_CHANGE:
 		ew = kmalloc(sizeof *ew, GFP_ATOMIC);
 		if (!ew)
-			break;
+			return;
 
 		INIT_WORK(&ew->work, handle_port_mgmt_change_event);
 		memcpy(&ew->ib_eqe, eqe, sizeof *eqe);
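
Note: besides dropping the FMR ops table and the flow-domain translation array, main.c replaces the local field_avail() macro with offsetofend() from <linux/stddef.h> when probing how much of the extended query-device response the caller's buffer can hold. The two are equivalent checks on the user output length:

	/* removed helper */
	#define field_avail(type, fld, sz) \
		(offsetof(type, fld) + sizeof(((type *)0)->fld) <= (sz))

	/* replacement, as used above */
	if (offsetofend(typeof(resp), rss_caps) <= uhw->outlen) {
		/* resp.rss_caps (and everything before it) fits */
	}
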
diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
index d844831..5e4ec97 100644
--- a/drivers/infiniband/hw/mlx4/mcg.c
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -944,7 +944,7 @@
 	switch (sa_mad->mad_hdr.method) {
 	case IB_MGMT_METHOD_SET:
 		may_create = 1;
-		/* fall through */
+		fallthrough;
 	case IB_SA_METHOD_DELETE:
 		req = kzalloc(sizeof *req, GFP_KERNEL);
 		if (!req)
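
Note: as in cq.c above and qp.c below, the /* fall through */ comments become the fallthrough pseudo-keyword, which the kernel maps to the compiler's fall-through attribute where available so -Wimplicit-fallthrough keeps recognizing intentional fall-throughs. Shape of the construct, with hypothetical labels and handler:

	switch (method) {
	case METHOD_SET:
		may_create = 1;
		fallthrough;		/* replaces the old comment */
	case METHOD_DELETE:
		queue_request(method, may_create);
		break;
	}
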
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 0173e39..58df064 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -146,11 +146,6 @@
 	struct mlx4_mw		mmw;
 };
 
-struct mlx4_ib_fmr {
-	struct ib_fmr           ibfmr;
-	struct mlx4_fmr         mfmr;
-};
-
 #define MAX_REGS_PER_FLOW 2
 
 struct mlx4_flow_reg_id {
@@ -238,7 +233,8 @@
 };
 
 enum {
-	MLX4_NUM_TUNNEL_BUFS		= 256,
+	MLX4_NUM_TUNNEL_BUFS		= 512,
+	MLX4_NUM_WIRE_BUFS		= 2048,
 };
 
 struct mlx4_ib_tunnel_header {
@@ -303,6 +299,26 @@
 	u8			rss_key[MLX4_EN_RSS_KEY_SIZE];
 };
 
+enum {
+	/*
+	 * Largest possible UD header: send with GRH and immediate
+	 * data plus 18 bytes for an Ethernet header with VLAN/802.1Q
+	 * tag.  (LRH would only use 8 bytes, so Ethernet is the
+	 * biggest case)
+	 */
+	MLX4_IB_UD_HEADER_SIZE		= 82,
+	MLX4_IB_LSO_HEADER_SPARE	= 128,
+};
+
+struct mlx4_ib_sqp {
+	int pkey_index;
+	u32 qkey;
+	u32 send_psn;
+	struct ib_ud_header ud_header;
+	u8 header_buf[MLX4_IB_UD_HEADER_SIZE];
+	struct ib_qp *roce_v2_gsi;
+};
+
 struct mlx4_ib_qp {
 	union {
 		struct ib_qp	ibqp;
@@ -348,7 +364,10 @@
 	struct mlx4_wqn_range	*wqn_range;
 	/* Number of RSS QP parents that uses this WQ */
 	u32			rss_usecnt;
-	struct mlx4_ib_rss	*rss_ctx;
+	union {
+		struct mlx4_ib_rss *rss_ctx;
+		struct mlx4_ib_sqp *sqp;
+	};
 };
 
 struct mlx4_ib_srq {
@@ -371,6 +390,10 @@
 	union mlx4_ext_av       av;
 };
 
+struct mlx4_ib_rwq_ind_table {
+	struct ib_rwq_ind_table ib_rwq_ind_tbl;
+};
+
 /****************************************/
 /* alias guid support */
 /****************************************/
@@ -499,6 +522,7 @@
 	spinlock_t id_map_lock;
 	struct rb_root sl_id_map;
 	struct list_head cm_list;
+	struct xarray xa_rej_tmout;
 };
 
 struct gid_cache_context {
@@ -510,6 +534,7 @@
 	union ib_gid	gid;
 	enum ib_gid_type gid_type;
 	struct gid_cache_context *ctx;
+	u16 vlan_id;
 };
 
 struct mlx4_port_gid_table {
@@ -680,11 +705,6 @@
 	return container_of(ibmw, struct mlx4_ib_mw, ibmw);
 }
 
-static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr)
-{
-	return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr);
-}
-
 static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow)
 {
 	return container_of(ibflow, struct mlx4_ib_flow, ibflow);
@@ -736,36 +756,38 @@
 				  u64 virt_addr, int access_flags,
 				  struct ib_udata *udata);
 int mlx4_ib_dereg_mr(struct ib_mr *mr, struct ib_udata *udata);
-struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
-			       struct ib_udata *udata);
+int mlx4_ib_alloc_mw(struct ib_mw *mw, struct ib_udata *udata);
 int mlx4_ib_dealloc_mw(struct ib_mw *mw);
 struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			       u32 max_num_sg, struct ib_udata *udata);
+			       u32 max_num_sg);
 int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		      unsigned int *sg_offset);
 int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
 int mlx4_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		      struct ib_udata *udata);
-void mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
 void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
 void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
 
-int mlx4_ib_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+int mlx4_ib_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr,
 		      struct ib_udata *udata);
 int mlx4_ib_create_ah_slave(struct ib_ah *ah, struct rdma_ah_attr *ah_attr,
 			    int slave_sgid_index, u8 *s_mac, u16 vlan_tag);
 int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
-void mlx4_ib_destroy_ah(struct ib_ah *ah, u32 flags);
+static inline int mlx4_ib_destroy_ah(struct ib_ah *ah, u32 flags)
+{
+	return 0;
+}
 
 int mlx4_ib_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *init_attr,
 		       struct ib_udata *udata);
 int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
 int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
-void mlx4_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
+int mlx4_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
 void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index);
 int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 			  const struct ib_recv_wr **bad_wr);
@@ -788,20 +810,13 @@
 int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
 		 int port, const struct ib_wc *in_wc, const struct ib_grh *in_grh,
 		 const void *in_mad, void *response_mad);
-int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags,	u8 port_num,
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 			const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-			const struct ib_mad_hdr *in, size_t in_mad_size,
-			struct ib_mad_hdr *out, size_t *out_mad_size,
-			u16 *out_mad_pkey_index);
+			const struct ib_mad *in, struct ib_mad *out,
+			size_t *out_mad_size, u16 *out_mad_pkey_index);
 int mlx4_ib_mad_init(struct mlx4_ib_dev *dev);
 void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev);
 
-struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags,
-				  struct ib_fmr_attr *fmr_attr);
-int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages,
-			 u64 iova);
-int mlx4_ib_unmap_fmr(struct list_head *fmr_list);
-int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr);
 int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
 			 struct ib_port_attr *props, int netw_view);
 int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
@@ -908,15 +923,18 @@
 struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd,
 				struct ib_wq_init_attr *init_attr,
 				struct ib_udata *udata);
-void mlx4_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
+int mlx4_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
 int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
 		      u32 wq_attr_mask, struct ib_udata *udata);
 
-struct ib_rwq_ind_table
-*mlx4_ib_create_rwq_ind_table(struct ib_device *device,
-			      struct ib_rwq_ind_table_init_attr *init_attr,
-			      struct ib_udata *udata);
-int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
+int mlx4_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl,
+				 struct ib_rwq_ind_table_init_attr *init_attr,
+				 struct ib_udata *udata);
+static inline int
+mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table)
+{
+	return 0;
+}
 int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va,
 				       int *num_of_mtts);
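
Note: the header changes track the rest of the series: the FMR wrapper and to_mfmr() disappear, several destroy/dealloc verbs now return int, trivial destroy paths (AH, RWQ indirection table) become inline stubs, struct mlx4_ib_sqp moves here to be reached through the new qp->sqp union member, and MW/XRCD/RWQ-indirection-table objects are now allocated by the RDMA core, sized through INIT_RDMA_OBJ_SIZE(). A condensed sketch of that allocation pattern (ops table shortened; the ib_* struct sits at the start of the driver struct so the core's allocation converts cleanly):

	struct mlx4_ib_mw {
		struct ib_mw	ibmw;	/* first member: core allocates this */
		struct mlx4_mw	mmw;
	};

	static const struct ib_device_ops mw_ops = {	/* shortened */
		.alloc_mw = mlx4_ib_alloc_mw,	/* int (*)(struct ib_mw *, struct ib_udata *) */
		.dealloc_mw = mlx4_ib_dealloc_mw,
		INIT_RDMA_OBJ_SIZE(ib_mw, mlx4_ib_mw, ibmw),
	};
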
 
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index 9114cb7..426fed0 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -271,6 +271,8 @@
 	u64 total_len = 0;
 	int i;
 
+	*num_of_mtts = ib_umem_num_dma_blocks(umem, PAGE_SIZE);
+
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
 		/*
 		 * Initialization - save the first chunk start as the
@@ -367,7 +369,7 @@
 	return block_shift;
 }
 
-static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start,
+static struct ib_umem *mlx4_get_umem_mr(struct ib_device *device, u64 start,
 					u64 length, int access_flags)
 {
 	/*
@@ -380,7 +382,7 @@
 		unsigned long untagged_start = untagged_addr(start);
 		struct vm_area_struct *vma;
 
-		down_read(&current->mm->mmap_sem);
+		mmap_read_lock(current->mm);
 		/*
 		 * FIXME: Ideally this would iterate over all the vmas that
 		 * cover the memory, but for now it requires a single vma to
@@ -395,10 +397,10 @@
 			access_flags |= IB_ACCESS_LOCAL_WRITE;
 		}
 
-		up_read(&current->mm->mmap_sem);
+		mmap_read_unlock(current->mm);
 	}
 
-	return ib_umem_get(udata, start, length, access_flags, 0);
+	return ib_umem_get(device, start, length, access_flags);
 }
 
 struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
@@ -415,13 +417,12 @@
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	mr->umem = mlx4_get_umem_mr(udata, start, length, access_flags);
+	mr->umem = mlx4_get_umem_mr(pd->device, start, length, access_flags);
 	if (IS_ERR(mr->umem)) {
 		err = PTR_ERR(mr->umem);
 		goto err_free;
 	}
 
-	n = ib_umem_page_count(mr->umem);
 	shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n);
 
 	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
@@ -503,7 +504,7 @@
 
 		mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
 		ib_umem_release(mmr->umem);
-		mmr->umem = mlx4_get_umem_mr(udata, start, length,
+		mmr->umem = mlx4_get_umem_mr(mr->device, start, length,
 					     mr_access_flags);
 		if (IS_ERR(mmr->umem)) {
 			err = PTR_ERR(mmr->umem);
@@ -511,7 +512,7 @@
 			mmr->umem = NULL;
 			goto release_mpt_entry;
 		}
-		n = ib_umem_page_count(mmr->umem);
+		n = ib_umem_num_dma_blocks(mmr->umem, PAGE_SIZE);
 		shift = PAGE_SHIFT;
 
 		err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
@@ -610,37 +611,27 @@
 	return 0;
 }
 
-struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
-			       struct ib_udata *udata)
+int mlx4_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
 {
-	struct mlx4_ib_dev *dev = to_mdev(pd->device);
-	struct mlx4_ib_mw *mw;
+	struct mlx4_ib_dev *dev = to_mdev(ibmw->device);
+	struct mlx4_ib_mw *mw = to_mmw(ibmw);
 	int err;
 
-	mw = kmalloc(sizeof(*mw), GFP_KERNEL);
-	if (!mw)
-		return ERR_PTR(-ENOMEM);
-
-	err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn,
-			    to_mlx4_type(type), &mw->mmw);
+	err = mlx4_mw_alloc(dev->dev, to_mpd(ibmw->pd)->pdn,
+			    to_mlx4_type(ibmw->type), &mw->mmw);
 	if (err)
-		goto err_free;
+		return err;
 
 	err = mlx4_mw_enable(dev->dev, &mw->mmw);
 	if (err)
 		goto err_mw;
 
-	mw->ibmw.rkey = mw->mmw.key;
-
-	return &mw->ibmw;
+	ibmw->rkey = mw->mmw.key;
+	return 0;
 
 err_mw:
 	mlx4_mw_free(dev->dev, &mw->mmw);
-
-err_free:
-	kfree(mw);
-
-	return ERR_PTR(err);
+	return err;
 }
 
 int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
@@ -648,13 +639,11 @@
 	struct mlx4_ib_mw *mw = to_mmw(ibmw);
 
 	mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw);
-	kfree(mw);
-
 	return 0;
 }
 
 struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			       u32 max_num_sg, struct ib_udata *udata)
+			       u32 max_num_sg)
 {
 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
 	struct mlx4_ib_mr *mr;
@@ -697,99 +686,6 @@
 	return ERR_PTR(err);
 }
 
-struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc,
-				 struct ib_fmr_attr *fmr_attr)
-{
-	struct mlx4_ib_dev *dev = to_mdev(pd->device);
-	struct mlx4_ib_fmr *fmr;
-	int err = -ENOMEM;
-
-	fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
-	if (!fmr)
-		return ERR_PTR(-ENOMEM);
-
-	err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc),
-			     fmr_attr->max_pages, fmr_attr->max_maps,
-			     fmr_attr->page_shift, &fmr->mfmr);
-	if (err)
-		goto err_free;
-
-	err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr);
-	if (err)
-		goto err_mr;
-
-	fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key;
-
-	return &fmr->ibfmr;
-
-err_mr:
-	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr);
-
-err_free:
-	kfree(fmr);
-
-	return ERR_PTR(err);
-}
-
-int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
-		      int npages, u64 iova)
-{
-	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
-	struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device);
-
-	return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova,
-				 &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
-}
-
-int mlx4_ib_unmap_fmr(struct list_head *fmr_list)
-{
-	struct ib_fmr *ibfmr;
-	int err;
-	struct mlx4_dev *mdev = NULL;
-
-	list_for_each_entry(ibfmr, fmr_list, list) {
-		if (mdev && to_mdev(ibfmr->device)->dev != mdev)
-			return -EINVAL;
-		mdev = to_mdev(ibfmr->device)->dev;
-	}
-
-	if (!mdev)
-		return 0;
-
-	list_for_each_entry(ibfmr, fmr_list, list) {
-		struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
-
-		mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
-	}
-
-	/*
-	 * Make sure all MPT status updates are visible before issuing
-	 * SYNC_TPT firmware command.
-	 */
-	wmb();
-
-	err = mlx4_SYNC_TPT(mdev);
-	if (err)
-		pr_warn("SYNC_TPT error %d when "
-		       "unmapping FMRs\n", err);
-
-	return 0;
-}
-
-int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr)
-{
-	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
-	struct mlx4_ib_dev *dev = to_mdev(ibfmr->device);
-	int err;
-
-	err = mlx4_fmr_free(dev->dev, &ifmr->mfmr);
-
-	if (!err)
-		kfree(ifmr);
-
-	return err;
-}
-
 static int mlx4_set_page(struct ib_mr *ibmr, u64 addr)
 {
 	struct mlx4_ib_mr *mr = to_mmr(ibmr);
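
Note: with the FMR verbs deleted here (and the matching ops table gone from main.c), the usual replacement for former FMR consumers is FRWR-style registration: a regular MR from ib_alloc_mr(), which also loses its udata argument in this series, mapped with ib_map_mr_sg(). A hedged consumer-side sketch, assuming pd, sg, sg_nents and max_num_sg come from the caller:

	struct ib_mr *mr;
	int mapped;

	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, max_num_sg);
	if (IS_ERR(mr))
		return PTR_ERR(mr);

	mapped = ib_map_mr_sg(mr, sg, sg_nents, NULL, PAGE_SIZE);
	if (mapped < sg_nents) {
		/* the scatterlist did not fit into max_num_sg pages */
	}
	/* registration/invalidation then goes through REG_MR work requests */
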
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 17ce928..c6a815a 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -68,27 +68,6 @@
 };
 
 enum {
-	/*
-	 * Largest possible UD header: send with GRH and immediate
-	 * data plus 18 bytes for an Ethernet header with VLAN/802.1Q
-	 * tag.  (LRH would only use 8 bytes, so Ethernet is the
-	 * biggest case)
-	 */
-	MLX4_IB_UD_HEADER_SIZE		= 82,
-	MLX4_IB_LSO_HEADER_SPARE	= 128,
-};
-
-struct mlx4_ib_sqp {
-	struct mlx4_ib_qp	qp;
-	int			pkey_index;
-	u32			qkey;
-	u32			send_psn;
-	struct ib_ud_header	ud_header;
-	u8			header_buf[MLX4_IB_UD_HEADER_SIZE];
-	struct ib_qp		*roce_v2_gsi;
-};
-
-enum {
 	MLX4_IB_MIN_SQ_STRIDE	= 6,
 	MLX4_IB_CACHE_LINE_SIZE	= 64,
 };
@@ -123,11 +102,6 @@
 	MLX4_IB_RWQ_SRC	= 1,
 };
 
-static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
-{
-	return container_of(mqp, struct mlx4_ib_sqp, qp);
-}
-
 static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 {
 	if (!mlx4_is_master(dev->dev))
@@ -656,8 +630,6 @@
 	if (err)
 		goto err_qpn;
 
-	mutex_init(&qp->mutex);
-
 	INIT_LIST_HEAD(&qp->gid_list);
 	INIT_LIST_HEAD(&qp->steering_rules);
 
@@ -696,80 +668,72 @@
 	return err;
 }
 
-static struct ib_qp *_mlx4_ib_create_qp_rss(struct ib_pd *pd,
-					    struct ib_qp_init_attr *init_attr,
-					    struct ib_udata *udata)
+static int _mlx4_ib_create_qp_rss(struct ib_pd *pd, struct mlx4_ib_qp *qp,
+				  struct ib_qp_init_attr *init_attr,
+				  struct ib_udata *udata)
 {
-	struct mlx4_ib_qp *qp;
 	struct mlx4_ib_create_qp_rss ucmd = {};
 	size_t required_cmd_sz;
 	int err;
 
 	if (!udata) {
 		pr_debug("RSS QP with NULL udata\n");
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 	if (udata->outlen)
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
 	required_cmd_sz = offsetof(typeof(ucmd), reserved1) +
 					sizeof(ucmd.reserved1);
 	if (udata->inlen < required_cmd_sz) {
 		pr_debug("invalid inlen\n");
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 	if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) {
 		pr_debug("copy failed\n");
-		return ERR_PTR(-EFAULT);
+		return -EFAULT;
 	}
 
 	if (memchr_inv(ucmd.reserved, 0, sizeof(ucmd.reserved)))
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
 	if (ucmd.comp_mask || ucmd.reserved1)
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
 	if (udata->inlen > sizeof(ucmd) &&
 	    !ib_is_udata_cleared(udata, sizeof(ucmd),
 				 udata->inlen - sizeof(ucmd))) {
 		pr_debug("inlen is not supported\n");
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 	}
 
 	if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
 		pr_debug("RSS QP with unsupported QP type %d\n",
 			 init_attr->qp_type);
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 	}
 
 	if (init_attr->create_flags) {
 		pr_debug("RSS QP doesn't support create flags\n");
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 	}
 
 	if (init_attr->send_cq || init_attr->cap.max_send_wr) {
 		pr_debug("RSS QP with unsupported send attributes\n");
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 	}
 
-	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
-	if (!qp)
-		return ERR_PTR(-ENOMEM);
-
 	qp->pri.vid = 0xFFFF;
 	qp->alt.vid = 0xFFFF;
 
 	err = create_qp_rss(to_mdev(pd->device), init_attr, &ucmd, qp);
-	if (err) {
-		kfree(qp);
-		return ERR_PTR(err);
-	}
+	if (err)
+		return err;
 
 	qp->ibqp.qp_num = qp->mqp.qpn;
-
-	return &qp->ibqp;
+	return 0;
 }
 
 /*
@@ -849,7 +813,7 @@
 	 * reused for further WQN allocations.
 	 * The next created WQ will allocate a new range.
 	 */
-		range->dirty = 1;
+		range->dirty = true;
 	}
 
 	mutex_unlock(&context->wqn_ranges_mutex);
@@ -873,7 +837,6 @@
 
 	qp->mlx4_ib_qp_type = MLX4_IB_QPT_RAW_PACKET;
 
-	mutex_init(&qp->mutex);
 	spin_lock_init(&qp->sq.lock);
 	spin_lock_init(&qp->rq.lock);
 	INIT_LIST_HEAD(&qp->gid_list);
@@ -916,13 +879,12 @@
 	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
 		       (qp->sq.wqe_cnt << qp->sq.wqe_shift);
 
-	qp->umem = ib_umem_get(udata, wq.buf_addr, qp->buf_size, 0, 0);
+	qp->umem = ib_umem_get(pd->device, wq.buf_addr, qp->buf_size, 0);
 	if (IS_ERR(qp->umem)) {
 		err = PTR_ERR(qp->umem);
 		goto err;
 	}
 
-	n = ib_umem_page_count(qp->umem);
 	shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n);
 	err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt);
 
@@ -989,13 +951,11 @@
 
 static int create_qp_common(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
 			    struct ib_udata *udata, int sqpn,
-			    struct mlx4_ib_qp **caller_qp)
+			    struct mlx4_ib_qp *qp)
 {
 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
 	int qpn;
 	int err;
-	struct mlx4_ib_sqp *sqp = NULL;
-	struct mlx4_ib_qp *qp;
 	struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
 		udata, struct mlx4_ib_ucontext, ibucontext);
 	enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
@@ -1043,27 +1003,18 @@
 		sqpn = qpn;
 	}
 
-	if (!*caller_qp) {
-		if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI ||
-		    (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER |
-				MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) {
-			sqp = kzalloc(sizeof(struct mlx4_ib_sqp), GFP_KERNEL);
-			if (!sqp)
-				return -ENOMEM;
-			qp = &sqp->qp;
-		} else {
-			qp = kzalloc(sizeof(struct mlx4_ib_qp), GFP_KERNEL);
-			if (!qp)
-				return -ENOMEM;
-		}
-		qp->pri.vid = 0xFFFF;
-		qp->alt.vid = 0xFFFF;
-	} else
-		qp = *caller_qp;
+	if (init_attr->qp_type == IB_QPT_SMI ||
+	    init_attr->qp_type == IB_QPT_GSI || qp_type == MLX4_IB_QPT_SMI ||
+	    qp_type == MLX4_IB_QPT_GSI ||
+	    (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER |
+			MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) {
+		qp->sqp = kzalloc(sizeof(struct mlx4_ib_sqp), GFP_KERNEL);
+		if (!qp->sqp)
+			return -ENOMEM;
+	}
 
 	qp->mlx4_ib_qp_type = qp_type;
 
-	mutex_init(&qp->mutex);
 	spin_lock_init(&qp->sq.lock);
 	spin_lock_init(&qp->rq.lock);
 	INIT_LIST_HEAD(&qp->gid_list);
@@ -1111,13 +1062,12 @@
 			goto err;
 
 		qp->umem =
-			ib_umem_get(udata, ucmd.buf_addr, qp->buf_size, 0, 0);
+			ib_umem_get(pd->device, ucmd.buf_addr, qp->buf_size, 0);
 		if (IS_ERR(qp->umem)) {
 			err = PTR_ERR(qp->umem);
 			goto err;
 		}
 
-		n = ib_umem_page_count(qp->umem);
 		shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n);
 		err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt);
 
@@ -1149,8 +1099,10 @@
 			if (dev->steering_support ==
 			    MLX4_STEERING_MODE_DEVICE_MANAGED)
 				qp->flags |= MLX4_IB_QP_NETIF;
-			else
+			else {
+				err = -EINVAL;
 				goto err;
+			}
 		}
 
 		err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
@@ -1239,9 +1191,6 @@
 
 	qp->mqp.event = mlx4_ib_qp_event;
 
-	if (!*caller_qp)
-		*caller_qp = qp;
-
 	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
 	mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq),
 			 to_mcq(init_attr->recv_cq));
@@ -1293,10 +1242,7 @@
 		mlx4_db_free(dev->dev, &qp->db);
 
 err:
-	if (!sqp && !*caller_qp)
-		kfree(qp);
-	kfree(sqp);
-
+	kfree(qp->sqp);
 	return err;
 }
 
@@ -1410,7 +1356,6 @@
 	mlx4_qp_free(dev->dev, &qp->mqp);
 	mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
 	del_gid_entries(qp);
-	kfree(qp->rss_ctx);
 }
 
 static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
@@ -1529,17 +1474,16 @@
 		return dev->dev->caps.spec_qps[attr->port_num - 1].qp1_proxy;
 }
 
-static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd,
-					struct ib_qp_init_attr *init_attr,
-					struct ib_udata *udata)
+static int _mlx4_ib_create_qp(struct ib_pd *pd, struct mlx4_ib_qp *qp,
+			      struct ib_qp_init_attr *init_attr,
+			      struct ib_udata *udata)
 {
-	struct mlx4_ib_qp *qp = NULL;
 	int err;
 	int sup_u_create_flags = MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
 	u16 xrcdn = 0;
 
 	if (init_attr->rwq_ind_tbl)
-		return _mlx4_ib_create_qp_rss(pd, init_attr, udata);
+		return _mlx4_ib_create_qp_rss(pd, qp, init_attr, udata);
 
 	/*
 	 * We only support LSO, vendor flag1, and multicast loopback blocking,
@@ -1551,16 +1495,16 @@
 					MLX4_IB_SRIOV_SQP |
 					MLX4_IB_QP_NETIF |
 					MLX4_IB_QP_CREATE_ROCE_V2_GSI))
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
 		if (init_attr->qp_type != IB_QPT_UD)
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 	}
 
 	if (init_attr->create_flags) {
 		if (udata && init_attr->create_flags & ~(sup_u_create_flags))
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 
 		if ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP |
 						 MLX4_IB_QP_CREATE_ROCE_V2_GSI  |
@@ -1570,7 +1514,7 @@
 		     init_attr->qp_type > IB_QPT_GSI) ||
 		    (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI &&
 		     init_attr->qp_type != IB_QPT_GSI))
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 	}
 
 	switch (init_attr->qp_type) {
@@ -1578,56 +1522,46 @@
 		pd = to_mxrcd(init_attr->xrcd)->pd;
 		xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
 		init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq;
-		/* fall through */
+		fallthrough;
 	case IB_QPT_XRC_INI:
 		if (!(to_mdev(pd->device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
-			return ERR_PTR(-ENOSYS);
+			return -ENOSYS;
 		init_attr->recv_cq = init_attr->send_cq;
-		/* fall through */
+		fallthrough;
 	case IB_QPT_RC:
 	case IB_QPT_UC:
 	case IB_QPT_RAW_PACKET:
-		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
-		if (!qp)
-			return ERR_PTR(-ENOMEM);
+	case IB_QPT_UD:
 		qp->pri.vid = 0xFFFF;
 		qp->alt.vid = 0xFFFF;
-		/* fall through */
-	case IB_QPT_UD:
-	{
-		err = create_qp_common(pd, init_attr, udata, 0, &qp);
-		if (err) {
-			kfree(qp);
-			return ERR_PTR(err);
-		}
+		err = create_qp_common(pd, init_attr, udata, 0, qp);
+		if (err)
+			return err;
 
 		qp->ibqp.qp_num = qp->mqp.qpn;
 		qp->xrcdn = xrcdn;
-
 		break;
-	}
 	case IB_QPT_SMI:
 	case IB_QPT_GSI:
 	{
 		int sqpn;
 
-		/* Userspace is not allowed to create special QPs: */
-		if (udata)
-			return ERR_PTR(-EINVAL);
 		if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) {
 			int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev,
 							1, 1, &sqpn, 0,
 							MLX4_RES_USAGE_DRIVER);
 
 			if (res)
-				return ERR_PTR(res);
+				return res;
 		} else {
 			sqpn = get_sqp_num(to_mdev(pd->device), init_attr);
 		}
 
-		err = create_qp_common(pd, init_attr, udata, sqpn, &qp);
+		qp->pri.vid = 0xFFFF;
+		qp->alt.vid = 0xFFFF;
+		err = create_qp_common(pd, init_attr, udata, sqpn, qp);
 		if (err)
-			return ERR_PTR(err);
+			return err;
 
 		qp->port	= init_attr->port_num;
 		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 :
@@ -1636,25 +1570,33 @@
 	}
 	default:
 		/* Don't support raw QPs */
-		return ERR_PTR(-EINVAL);
+		return -EOPNOTSUPP;
 	}
-
-	return &qp->ibqp;
+	return 0;
 }
 
 struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
 				struct ib_qp_init_attr *init_attr,
 				struct ib_udata *udata) {
 	struct ib_device *device = pd ? pd->device : init_attr->xrcd->device;
-	struct ib_qp *ibqp;
 	struct mlx4_ib_dev *dev = to_mdev(device);
+	struct mlx4_ib_qp *qp;
+	int ret;
 
-	ibqp = _mlx4_ib_create_qp(pd, init_attr, udata);
+	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+	if (!qp)
+		return ERR_PTR(-ENOMEM);
 
-	if (!IS_ERR(ibqp) &&
-	    (init_attr->qp_type == IB_QPT_GSI) &&
+	mutex_init(&qp->mutex);
+	ret = _mlx4_ib_create_qp(pd, qp, init_attr, udata);
+	if (ret) {
+		kfree(qp);
+		return ERR_PTR(ret);
+	}
+
+	if (init_attr->qp_type == IB_QPT_GSI &&
 	    !(init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI)) {
-		struct mlx4_ib_sqp *sqp = to_msqp((to_mqp(ibqp)));
+		struct mlx4_ib_sqp *sqp = qp->sqp;
 		int is_eth = rdma_cap_eth_ah(&dev->ib_dev, init_attr->port_num);
 
 		if (is_eth &&
@@ -1666,14 +1608,14 @@
 				pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi));
 				sqp->roce_v2_gsi = NULL;
 			} else {
-				sqp = to_msqp(to_mqp(sqp->roce_v2_gsi));
-				sqp->qp.flags |= MLX4_IB_ROCE_V2_GSI_QP;
+				to_mqp(sqp->roce_v2_gsi)->flags |=
+					MLX4_IB_ROCE_V2_GSI_QP;
 			}
 
 			init_attr->create_flags &= ~MLX4_IB_QP_CREATE_ROCE_V2_GSI;
 		}
 	}
-	return ibqp;
+	return &qp->ibqp;
 }
 
 static int _mlx4_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
@@ -1700,10 +1642,8 @@
 		destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, udata);
 	}
 
-	if (is_sqp(dev, mqp))
-		kfree(to_msqp(mqp));
-	else
-		kfree(mqp);
+	kfree(mqp->sqp);
+	kfree(mqp);
 
 	return 0;
 }
@@ -1713,7 +1653,7 @@
 	struct mlx4_ib_qp *mqp = to_mqp(qp);
 
 	if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
-		struct mlx4_ib_sqp *sqp = to_msqp(mqp);
+		struct mlx4_ib_sqp *sqp = mqp->sqp;
 
 		if (sqp->roce_v2_gsi)
 			ib_destroy_qp(sqp->roce_v2_gsi);
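
Note: qp.c completes the special-QP rework started in mlx4_ib.h: struct mlx4_ib_sqp no longer embeds the QP (so the container_of()-based to_msqp() helper is removed), every QP is a plain struct mlx4_ib_qp allocated once in mlx4_ib_create_qp(), and SMI/GSI/proxy QPs hang their extra state off the new qp->sqp pointer. In short:

	/* before: wrapper embedded the QP, recovered via container_of() */
	struct mlx4_ib_sqp *sqp = to_msqp(to_mqp(ibqp));

	/* after: side allocation reached through the union member
	 * (only valid for SMI/GSI/proxy QPs; RSS QPs use rss_ctx)
	 */
	struct mlx4_ib_sqp *sqp = to_mqp(ibqp)->sqp;
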
@@ -2575,7 +2515,7 @@
 		qp->alt_port = attr->alt_port_num;
 
 	if (is_sqp(dev, qp))
-		store_sqp_attrs(to_msqp(qp), attr, attr_mask);
+		store_sqp_attrs(qp->sqp, attr, attr_mask);
 
 	/*
 	 * If we moved QP0 to RTR, bring the IB link up; if we moved
@@ -2852,7 +2792,7 @@
 	ret = _mlx4_ib_modify_qp(ibqp, attr, attr_mask, udata);
 
 	if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
-		struct mlx4_ib_sqp *sqp = to_msqp(mqp);
+		struct mlx4_ib_sqp *sqp = mqp->sqp;
 		int err = 0;
 
 		if (sqp->roce_v2_gsi)
@@ -2877,12 +2817,13 @@
 	return -EINVAL;
 }
 
-static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
+static int build_sriov_qp0_header(struct mlx4_ib_qp *qp,
 				  const struct ib_ud_wr *wr,
 				  void *wqe, unsigned *mlx_seg_len)
 {
-	struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device);
-	struct ib_device *ib_dev = &mdev->ib_dev;
+	struct mlx4_ib_dev *mdev = to_mdev(qp->ibqp.device);
+	struct mlx4_ib_sqp *sqp = qp->sqp;
+	struct ib_device *ib_dev = qp->ibqp.device;
 	struct mlx4_wqe_mlx_seg *mlx = wqe;
 	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
 	struct mlx4_ib_ah *ah = to_mah(wr->ah);
@@ -2904,12 +2845,12 @@
 
 	/* for proxy-qp0 sends, need to add in size of tunnel header */
 	/* for tunnel-qp0 sends, tunnel header is already in s/g list */
-	if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER)
+	if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER)
 		send_size += sizeof (struct mlx4_ib_tunnel_header);
 
 	ib_ud_header_init(send_size, 1, 0, 0, 0, 0, 0, 0, &sqp->ud_header);
 
-	if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) {
+	if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) {
 		sqp->ud_header.lrh.service_level =
 			be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
 		sqp->ud_header.lrh.destination_lid =
@@ -2926,26 +2867,26 @@
 
 	sqp->ud_header.lrh.virtual_lane    = 0;
 	sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED);
-	err = ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey);
+	err = ib_get_cached_pkey(ib_dev, qp->port, 0, &pkey);
 	if (err)
 		return err;
 	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
-	if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER)
+	if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER)
 		sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn);
 	else
 		sqp->ud_header.bth.destination_qpn =
-			cpu_to_be32(mdev->dev->caps.spec_qps[sqp->qp.port - 1].qp0_tunnel);
+			cpu_to_be32(mdev->dev->caps.spec_qps[qp->port - 1].qp0_tunnel);
 
 	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
 	if (mlx4_is_master(mdev->dev)) {
-		if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
+		if (mlx4_get_parav_qkey(mdev->dev, qp->mqp.qpn, &qkey))
 			return -EINVAL;
 	} else {
-		if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
+		if (vf_get_qp0_qkey(mdev->dev, qp->mqp.qpn, &qkey))
 			return -EINVAL;
 	}
 	sqp->ud_header.deth.qkey = cpu_to_be32(qkey);
-	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn);
+	sqp->ud_header.deth.source_qpn = cpu_to_be32(qp->mqp.qpn);
 
 	sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY;
 	sqp->ud_header.immediate_present = 0;
@@ -3029,10 +2970,11 @@
 }
 
 #define MLX4_ROCEV2_QP1_SPORT 0xC000
-static int build_mlx_header(struct mlx4_ib_sqp *sqp, const struct ib_ud_wr *wr,
+static int build_mlx_header(struct mlx4_ib_qp *qp, const struct ib_ud_wr *wr,
 			    void *wqe, unsigned *mlx_seg_len)
 {
-	struct ib_device *ib_dev = sqp->qp.ibqp.device;
+	struct mlx4_ib_sqp *sqp = qp->sqp;
+	struct ib_device *ib_dev = qp->ibqp.device;
 	struct mlx4_ib_dev *ibdev = to_mdev(ib_dev);
 	struct mlx4_wqe_mlx_seg *mlx = wqe;
 	struct mlx4_wqe_ctrl_seg *ctrl = wqe;
@@ -3056,7 +2998,7 @@
 	for (i = 0; i < wr->wr.num_sge; ++i)
 		send_size += wr->wr.sg_list[i].length;
 
-	is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
+	is_eth = rdma_port_get_link_layer(qp->ibqp.device, qp->port) == IB_LINK_LAYER_ETHERNET;
 	is_grh = mlx4_ib_ah_grh_present(ah);
 	if (is_eth) {
 		enum ib_gid_type gid_type;
@@ -3070,9 +3012,9 @@
 			if (err)
 				return err;
 		} else  {
-			err = fill_gid_by_hw_index(ibdev, sqp->qp.port,
-					    ah->av.ib.gid_index,
-					    &sgid, &gid_type);
+			err = fill_gid_by_hw_index(ibdev, qp->port,
+						   ah->av.ib.gid_index, &sgid,
+						   &gid_type);
 			if (!err) {
 				is_udp = gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP;
 				if (is_udp) {
@@ -3088,7 +3030,7 @@
 		}
 		if (ah->av.eth.vlan != cpu_to_be16(0xffff)) {
 			vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff;
-			is_vlan = 1;
+			is_vlan = true;
 		}
 	}
 	err = ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh,
@@ -3117,13 +3059,18 @@
 				 * indexes don't necessarily match the hw ones, so
 				 * we must use our own cache
 				 */
-				sqp->ud_header.grh.source_gid.global.subnet_prefix =
-					cpu_to_be64(atomic64_read(&(to_mdev(ib_dev)->sriov.
-								    demux[sqp->qp.port - 1].
-								    subnet_prefix)));
-				sqp->ud_header.grh.source_gid.global.interface_id =
-					to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
-						       guid_cache[ah->av.ib.gid_index];
+				sqp->ud_header.grh.source_gid.global
+					.subnet_prefix =
+					cpu_to_be64(atomic64_read(
+						&(to_mdev(ib_dev)
+							  ->sriov
+							  .demux[qp->port - 1]
+							  .subnet_prefix)));
+				sqp->ud_header.grh.source_gid.global
+					.interface_id =
+					to_mdev(ib_dev)
+						->sriov.demux[qp->port - 1]
+						.guid_cache[ah->av.ib.gid_index];
 			} else {
 				sqp->ud_header.grh.source_gid =
 					ah->ibah.sgid_attr->gid;
@@ -3155,10 +3102,13 @@
 	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
 
 	if (!is_eth) {
-		mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
-					  (sqp->ud_header.lrh.destination_lid ==
-					   IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
-					  (sqp->ud_header.lrh.service_level << 8));
+		mlx->flags |=
+			cpu_to_be32((!qp->ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
+				    (sqp->ud_header.lrh.destination_lid ==
+						     IB_LID_PERMISSIVE ?
+					     MLX4_WQE_MLX_SLR :
+					     0) |
+				    (sqp->ud_header.lrh.service_level << 8));
 		if (ah->av.ib.port_pd & cpu_to_be32(0x80000000))
 			mlx->flags |= cpu_to_be32(0x1); /* force loopback */
 		mlx->rlid = sqp->ud_header.lrh.destination_lid;
@@ -3204,21 +3154,23 @@
 			sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
 		}
 	} else {
-		sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 :
-							sl_to_vl(to_mdev(ib_dev),
-								 sqp->ud_header.lrh.service_level,
-								 sqp->qp.port);
-		if (sqp->qp.ibqp.qp_num && sqp->ud_header.lrh.virtual_lane == 15)
+		sqp->ud_header.lrh.virtual_lane =
+			!qp->ibqp.qp_num ?
+				15 :
+				sl_to_vl(to_mdev(ib_dev),
+					 sqp->ud_header.lrh.service_level,
+					 qp->port);
+		if (qp->ibqp.qp_num && sqp->ud_header.lrh.virtual_lane == 15)
 			return -EINVAL;
 		if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
 			sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
 	}
 	sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED);
-	if (!sqp->qp.ibqp.qp_num)
-		err = ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index,
+	if (!qp->ibqp.qp_num)
+		err = ib_get_cached_pkey(ib_dev, qp->port, sqp->pkey_index,
 					 &pkey);
 	else
-		err = ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->pkey_index,
+		err = ib_get_cached_pkey(ib_dev, qp->port, wr->pkey_index,
 					 &pkey);
 	if (err)
 		return err;
@@ -3228,7 +3180,7 @@
 	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
 	sqp->ud_header.deth.qkey = cpu_to_be32(wr->remote_qkey & 0x80000000 ?
 					       sqp->qkey : wr->remote_qkey);
-	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+	sqp->ud_header.deth.source_qpn = cpu_to_be32(qp->ibqp.qp_num);
 
 	header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
 
@@ -3541,24 +3493,24 @@
 	int nreq;
 	int err = 0;
 	unsigned ind;
-	int uninitialized_var(size);
-	unsigned uninitialized_var(seglen);
+	int size;
+	unsigned seglen;
 	__be32 dummy;
 	__be32 *lso_wqe;
-	__be32 uninitialized_var(lso_hdr_sz);
+	__be32 lso_hdr_sz;
 	__be32 blh;
 	int i;
 	struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
 
 	if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) {
-		struct mlx4_ib_sqp *sqp = to_msqp(qp);
+		struct mlx4_ib_sqp *sqp = qp->sqp;
 
 		if (sqp->roce_v2_gsi) {
 			struct mlx4_ib_ah *ah = to_mah(ud_wr(wr)->ah);
 			enum ib_gid_type gid_type;
 			union ib_gid gid;
 
-			if (!fill_gid_by_hw_index(mdev, sqp->qp.port,
+			if (!fill_gid_by_hw_index(mdev, qp->port,
 					   ah->av.ib.gid_index,
 					   &gid, &gid_type))
 				qp = (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ?
@@ -3678,8 +3630,8 @@
 			break;
 
 		case MLX4_IB_QPT_TUN_SMI_OWNER:
-			err =  build_sriov_qp0_header(to_msqp(qp), ud_wr(wr),
-					ctrl, &seglen);
+			err = build_sriov_qp0_header(qp, ud_wr(wr), ctrl,
+						     &seglen);
 			if (unlikely(err)) {
 				*bad_wr = wr;
 				goto out;
@@ -3715,8 +3667,8 @@
 			break;
 
 		case MLX4_IB_QPT_PROXY_SMI_OWNER:
-			err = build_sriov_qp0_header(to_msqp(qp), ud_wr(wr),
-					ctrl, &seglen);
+			err = build_sriov_qp0_header(qp, ud_wr(wr), ctrl,
+						     &seglen);
 			if (unlikely(err)) {
 				*bad_wr = wr;
 				goto out;
@@ -3749,8 +3701,7 @@
 
 		case MLX4_IB_QPT_SMI:
 		case MLX4_IB_QPT_GSI:
-			err = build_mlx_header(to_msqp(qp), ud_wr(wr), ctrl,
-					&seglen);
+			err = build_mlx_header(qp, ud_wr(wr), ctrl, &seglen);
 			if (unlikely(err)) {
 				*bad_wr = wr;
 				goto out;
@@ -4172,6 +4123,7 @@
 	if (!qp)
 		return ERR_PTR(-ENOMEM);
 
+	mutex_init(&qp->mutex);
 	qp->pri.vid = 0xFFFF;
 	qp->alt.vid = 0xFFFF;
 
@@ -4322,7 +4274,7 @@
 	return err;
 }
 
-void mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
+int mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(ibwq->device);
 	struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
@@ -4333,36 +4285,35 @@
 	destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, udata);
 
 	kfree(qp);
+	return 0;
 }
 
-struct ib_rwq_ind_table
-*mlx4_ib_create_rwq_ind_table(struct ib_device *device,
-			      struct ib_rwq_ind_table_init_attr *init_attr,
-			      struct ib_udata *udata)
+int mlx4_ib_create_rwq_ind_table(struct ib_rwq_ind_table *rwq_ind_table,
+				 struct ib_rwq_ind_table_init_attr *init_attr,
+				 struct ib_udata *udata)
 {
-	struct ib_rwq_ind_table *rwq_ind_table;
 	struct mlx4_ib_create_rwq_ind_tbl_resp resp = {};
 	unsigned int ind_tbl_size = 1 << init_attr->log_ind_tbl_size;
+	struct ib_device *device = rwq_ind_table->device;
 	unsigned int base_wqn;
 	size_t min_resp_len;
-	int i;
-	int err;
+	int i, err = 0;
 
 	if (udata->inlen > 0 &&
 	    !ib_is_udata_cleared(udata, 0,
 				 udata->inlen))
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
 	min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
 	if (udata->outlen && udata->outlen < min_resp_len)
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 
 	if (ind_tbl_size >
 	    device->attrs.rss_caps.max_rwq_indirection_table_size) {
 		pr_debug("log_ind_tbl_size = %d is bigger than supported = %d\n",
 			 ind_tbl_size,
 			 device->attrs.rss_caps.max_rwq_indirection_table_size);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 	base_wqn = init_attr->ind_tbl[0]->wq_num;
@@ -4370,39 +4321,23 @@
 	if (base_wqn % ind_tbl_size) {
 		pr_debug("WQN=0x%x isn't aligned with indirection table size\n",
 			 base_wqn);
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
 	for (i = 1; i < ind_tbl_size; i++) {
 		if (++base_wqn != init_attr->ind_tbl[i]->wq_num) {
 			pr_debug("indirection table's WQNs aren't consecutive\n");
-			return ERR_PTR(-EINVAL);
+			return -EINVAL;
 		}
 	}
 
-	rwq_ind_table = kzalloc(sizeof(*rwq_ind_table), GFP_KERNEL);
-	if (!rwq_ind_table)
-		return ERR_PTR(-ENOMEM);
-
 	if (udata->outlen) {
 		resp.response_length = offsetof(typeof(resp), response_length) +
 					sizeof(resp.response_length);
 		err = ib_copy_to_udata(udata, &resp, resp.response_length);
-		if (err)
-			goto err;
 	}
 
-	return rwq_ind_table;
-
-err:
-	kfree(rwq_ind_table);
-	return ERR_PTR(err);
-}
-
-int mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl)
-{
-	kfree(ib_rwq_ind_tbl);
-	return 0;
+	return err;
 }
 
 struct mlx4_ib_drain_cqe {
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index 848db72..bf61852 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -110,12 +110,14 @@
 		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
 			return -EFAULT;
 
-		srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0, 0);
+		srq->umem =
+			ib_umem_get(ib_srq->device, ucmd.buf_addr, buf_size, 0);
 		if (IS_ERR(srq->umem))
 			return PTR_ERR(srq->umem);
 
-		err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem),
-				    PAGE_SHIFT, &srq->mtt);
+		err = mlx4_mtt_init(
+			dev->dev, ib_umem_num_dma_blocks(srq->umem, PAGE_SIZE),
+			PAGE_SHIFT, &srq->mtt);
 		if (err)
 			goto err_buf;
 
@@ -259,7 +261,7 @@
 	return 0;
 }
 
-void mlx4_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
+int mlx4_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
 {
 	struct mlx4_ib_dev *dev = to_mdev(srq->device);
 	struct mlx4_ib_srq *msrq = to_msrq(srq);
@@ -281,6 +283,7 @@
 		mlx4_db_free(dev->dev, &msrq->db);
 	}
 	ib_umem_release(msrq->umem);
+	return 0;
 }
 
 void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index)
diff --git a/drivers/infiniband/hw/mlx5/Kconfig b/drivers/infiniband/hw/mlx5/Kconfig
index ea248de..ef1ff42 100644
--- a/drivers/infiniband/hw/mlx5/Kconfig
+++ b/drivers/infiniband/hw/mlx5/Kconfig
@@ -2,7 +2,7 @@
 config MLX5_INFINIBAND
 	tristate "Mellanox 5th generation network adapters (ConnectX series) support"
 	depends on NETDEVICES && ETHERNET && PCI && MLX5_CORE
-	---help---
+	help
 	  This driver provides low-level InfiniBand support for
 	  Mellanox Connect-IB PCI Express host channel adapters (HCAs).
 	  This is required to use InfiniBand protocols such as
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile
index 9924be8..b4c009b 100644
--- a/drivers/infiniband/hw/mlx5/Makefile
+++ b/drivers/infiniband/hw/mlx5/Makefile
@@ -1,10 +1,28 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-$(CONFIG_MLX5_INFINIBAND)	+= mlx5_ib.o
+obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o
 
-mlx5_ib-y :=	main.o cq.o doorbell.o qp.o mem.o srq_cmd.o \
-		srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o \
-		cong.o
+mlx5_ib-y := ah.o \
+	     cmd.o \
+	     cong.o \
+	     counters.o \
+	     cq.o \
+	     doorbell.o \
+	     gsi.o \
+	     ib_virt.o \
+	     mad.o \
+	     main.o \
+	     mem.o \
+	     mr.o \
+	     qp.o \
+	     qpc.o \
+	     restrack.o \
+	     srq.o \
+	     srq_cmd.o \
+	     wr.o
+
 mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o
 mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o
-mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o
-mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += flow.o
+mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o \
+					    fs.o \
+					    qos.o \
+					    std_types.o
diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c
index 80642dd..505bc47 100644
--- a/drivers/infiniband/hw/mlx5/ah.c
+++ b/drivers/infiniband/hw/mlx5/ah.c
@@ -32,9 +32,28 @@
 
 #include "mlx5_ib.h"
 
-static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah,
-			 struct rdma_ah_attr *ah_attr)
+static __be16 mlx5_ah_get_udp_sport(const struct mlx5_ib_dev *dev,
+				  const struct rdma_ah_attr *ah_attr)
 {
+	enum ib_gid_type gid_type = ah_attr->grh.sgid_attr->gid_type;
+	__be16 sport;
+
+	if ((gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) &&
+	    (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) &&
+	    (ah_attr->grh.flow_label & IB_GRH_FLOWLABEL_MASK))
+		sport = cpu_to_be16(
+			rdma_flow_label_to_udp_sport(ah_attr->grh.flow_label));
+	else
+		sport = mlx5_get_roce_udp_sport_min(dev,
+						    ah_attr->grh.sgid_attr);
+
+	return sport;
+}
+
+static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah,
+			 struct rdma_ah_init_attr *init_attr)
+{
+	struct rdma_ah_attr *ah_attr = init_attr->ah_attr;
 	enum ib_gid_type gid_type;
 
 	if (rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH) {
@@ -51,12 +70,15 @@
 	ah->av.stat_rate_sl = (rdma_ah_get_static_rate(ah_attr) << 4);
 
 	if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
+		if (init_attr->xmit_slave)
+			ah->xmit_port =
+				mlx5_lag_get_slave_port(dev->mdev,
+							init_attr->xmit_slave);
 		gid_type = ah_attr->grh.sgid_attr->gid_type;
 
 		memcpy(ah->av.rmac, ah_attr->roce.dmac,
 		       sizeof(ah_attr->roce.dmac));
-		ah->av.udp_sport =
-			mlx5_get_roce_udp_sport(dev, ah_attr->grh.sgid_attr);
+		ah->av.udp_sport = mlx5_ah_get_udp_sport(dev, ah_attr);
 		ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0x7) << 1;
 		if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
 #define MLX5_ECN_ENABLED BIT(1)
@@ -68,10 +90,11 @@
 	}
 }
 
-int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
-		      u32 flags, struct ib_udata *udata)
+int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
+		      struct ib_udata *udata)
 
 {
+	struct rdma_ah_attr *ah_attr = init_attr->ah_attr;
 	struct mlx5_ib_ah *ah = to_mah(ibah);
 	struct mlx5_ib_dev *dev = to_mdev(ibah->device);
 	enum rdma_ah_attr_type ah_type = ah_attr->type;
@@ -83,8 +106,8 @@
 	if (ah_type == RDMA_AH_ATTR_TYPE_ROCE && udata) {
 		int err;
 		struct mlx5_ib_create_ah_resp resp = {};
-		u32 min_resp_len = offsetof(typeof(resp), dmac) +
-				   sizeof(resp.dmac);
+		u32 min_resp_len =
+			offsetofend(struct mlx5_ib_create_ah_resp, dmac);
 
 		if (udata->outlen < min_resp_len)
 			return -EINVAL;
@@ -97,7 +120,7 @@
 			return err;
 	}
 
-	create_ib_ah(dev, ah, ah_attr);
+	create_ib_ah(dev, ah, init_attr);
 	return 0;
 }
 
@@ -124,8 +147,3 @@
 
 	return 0;
 }
-
-void mlx5_ib_destroy_ah(struct ib_ah *ah, u32 flags)
-{
-	return;
-}
diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c
index 4937947..234f299 100644
--- a/drivers/infiniband/hw/mlx5/cmd.c
+++ b/drivers/infiniband/hw/mlx5/cmd.c
@@ -1,46 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /*
- * Copyright (c) 2017, Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Copyright (c) 2017-2020, Mellanox Technologies inc. All rights reserved.
  */
 
 #include "cmd.h"
 
 int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey)
 {
-	u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {0};
-	u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)]   = {0};
+	u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {};
 	int err;
 
 	MLX5_SET(query_special_contexts_in, in, opcode,
 		 MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS);
-	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	err = mlx5_cmd_exec_inout(dev, query_special_contexts, in, out);
 	if (!err)
 		*mkey = MLX5_GET(query_special_contexts_out, out,
 				 dump_fill_mkey);
@@ -50,12 +23,12 @@
 int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey)
 {
 	u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {};
-	u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)]   = {};
+	u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {};
 	int err;
 
 	MLX5_SET(query_special_contexts_in, in, opcode,
 		 MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS);
-	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	err = mlx5_cmd_exec_inout(dev, query_special_contexts, in, out);
 	if (!err)
 		*null_mkey = MLX5_GET(query_special_contexts_out, out,
 				      null_mkey);
@@ -63,23 +36,15 @@
 }
 
 int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point,
-			       void *out, int out_size)
+			       void *out)
 {
-	u32 in[MLX5_ST_SZ_DW(query_cong_params_in)] = { };
+	u32 in[MLX5_ST_SZ_DW(query_cong_params_in)] = {};
 
 	MLX5_SET(query_cong_params_in, in, opcode,
 		 MLX5_CMD_OP_QUERY_CONG_PARAMS);
 	MLX5_SET(query_cong_params_in, in, cong_protocol, cong_point);
 
-	return mlx5_cmd_exec(dev, in, sizeof(in), out, out_size);
-}
-
-int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *dev,
-				void *in, int in_size)
-{
-	u32 out[MLX5_ST_SZ_DW(modify_cong_params_out)] = { };
-
-	return mlx5_cmd_exec(dev, in, in_size, out, sizeof(out));
+	return mlx5_cmd_exec_inout(dev, query_cong_params, in, out);
 }
 
 int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr,
@@ -133,7 +98,7 @@
 		MLX5_SET64(alloc_memic_in, in, range_start_addr,
 			   hw_start_addr + (page_idx * PAGE_SIZE));
 
-		ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+		ret = mlx5_cmd_exec_inout(dev, alloc_memic, in, out);
 		if (ret) {
 			spin_lock(&dm->lock);
 			bitmap_clear(dm->memic_alloc_pages,
@@ -157,13 +122,12 @@
 	return -ENOMEM;
 }
 
-int mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length)
+void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length)
 {
 	struct mlx5_core_dev *dev = dm->dev;
 	u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr);
 	u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE);
-	u32 out[MLX5_ST_SZ_DW(dealloc_memic_out)] = {0};
-	u32 in[MLX5_ST_SZ_DW(dealloc_memic_in)] = {0};
+	u32 in[MLX5_ST_SZ_DW(dealloc_memic_in)] = {};
 	u64 start_page_idx;
 	int err;
 
@@ -174,75 +138,58 @@
 	MLX5_SET64(dealloc_memic_in, in, memic_start_addr, addr);
 	MLX5_SET(dealloc_memic_in, in, memic_size, length);
 
-	err =  mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	err =  mlx5_cmd_exec_in(dev, dealloc_memic, in);
+	if (err)
+		return;
 
-	if (!err) {
-		spin_lock(&dm->lock);
-		bitmap_clear(dm->memic_alloc_pages,
-			     start_page_idx, num_pages);
-		spin_unlock(&dm->lock);
-	}
-
-	return err;
-}
-
-int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out)
-{
-	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
-	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
-
-	MLX5_SET(ppcnt_reg, in, local_port, 1);
-
-	MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP);
-	return  mlx5_core_access_reg(dev, in, sz, out, sz, MLX5_REG_PPCNT,
-				     0, 0);
+	spin_lock(&dm->lock);
+	bitmap_clear(dm->memic_alloc_pages,
+		     start_page_idx, num_pages);
+	spin_unlock(&dm->lock);
 }
 
 void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid)
 {
-	u32 in[MLX5_ST_SZ_DW(destroy_tir_in)]   = {};
-	u32 out[MLX5_ST_SZ_DW(destroy_tir_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {};
 
 	MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR);
 	MLX5_SET(destroy_tir_in, in, tirn, tirn);
 	MLX5_SET(destroy_tir_in, in, uid, uid);
-	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	mlx5_cmd_exec_in(dev, destroy_tir, in);
 }
 
 void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid)
 {
-	u32 in[MLX5_ST_SZ_DW(destroy_tis_in)]   = {0};
-	u32 out[MLX5_ST_SZ_DW(destroy_tis_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {};
 
 	MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS);
 	MLX5_SET(destroy_tis_in, in, tisn, tisn);
 	MLX5_SET(destroy_tis_in, in, uid, uid);
-	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	mlx5_cmd_exec_in(dev, destroy_tis, in);
 }
 
-void mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid)
+int mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid)
 {
-	u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)]   = {};
-	u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {};
 
 	MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT);
 	MLX5_SET(destroy_rqt_in, in, rqtn, rqtn);
 	MLX5_SET(destroy_rqt_in, in, uid, uid);
-	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	return mlx5_cmd_exec_in(dev, destroy_rqt, in);
 }
 
 int mlx5_cmd_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn,
 				    u16 uid)
 {
-	u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)]   = {0};
-	u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {};
+	u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {};
 	int err;
 
 	MLX5_SET(alloc_transport_domain_in, in, opcode,
 		 MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN);
 	MLX5_SET(alloc_transport_domain_in, in, uid, uid);
 
-	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	err = mlx5_cmd_exec_inout(dev, alloc_transport_domain, in, out);
 	if (!err)
 		*tdn = MLX5_GET(alloc_transport_domain_out, out,
 				transport_domain);
@@ -253,32 +200,29 @@
 void mlx5_cmd_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn,
 				       u16 uid)
 {
-	u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)]   = {0};
-	u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {};
 
 	MLX5_SET(dealloc_transport_domain_in, in, opcode,
 		 MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN);
 	MLX5_SET(dealloc_transport_domain_in, in, uid, uid);
 	MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn);
-	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	mlx5_cmd_exec_in(dev, dealloc_transport_domain, in);
 }
 
-void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid)
+int mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid)
 {
-	u32 out[MLX5_ST_SZ_DW(dealloc_pd_out)] = {};
-	u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)]   = {};
+	u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)] = {};
 
 	MLX5_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD);
 	MLX5_SET(dealloc_pd_in, in, pd, pdn);
 	MLX5_SET(dealloc_pd_in, in, uid, uid);
-	mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	return mlx5_cmd_exec_in(dev, dealloc_pd, in);
 }
 
 int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid,
 			u32 qpn, u16 uid)
 {
-	u32 out[MLX5_ST_SZ_DW(attach_to_mcg_out)] = {};
-	u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)]   = {};
+	u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)] = {};
 	void *gid;
 
 	MLX5_SET(attach_to_mcg_in, in, opcode, MLX5_CMD_OP_ATTACH_TO_MCG);
@@ -286,14 +230,13 @@
 	MLX5_SET(attach_to_mcg_in, in, uid, uid);
 	gid = MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid);
 	memcpy(gid, mgid, sizeof(*mgid));
-	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	return mlx5_cmd_exec_in(dev, attach_to_mcg, in);
 }
 
 int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid,
 			u32 qpn, u16 uid)
 {
-	u32 out[MLX5_ST_SZ_DW(detach_from_mcg_out)] = {};
-	u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)]   = {};
+	u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)] = {};
 	void *gid;
 
 	MLX5_SET(detach_from_mcg_in, in, opcode, MLX5_CMD_OP_DETACH_FROM_MCG);
@@ -301,18 +244,18 @@
 	MLX5_SET(detach_from_mcg_in, in, uid, uid);
 	gid = MLX5_ADDR_OF(detach_from_mcg_in, in, multicast_gid);
 	memcpy(gid, mgid, sizeof(*mgid));
-	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	return mlx5_cmd_exec_in(dev, detach_from_mcg, in);
 }
 
 int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid)
 {
 	u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {};
-	u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)]   = {};
+	u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)] = {};
 	int err;
 
 	MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD);
 	MLX5_SET(alloc_xrcd_in, in, uid, uid);
-	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+	err = mlx5_cmd_exec_inout(dev, alloc_xrcd, in, out);
 	if (!err)
 		*xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd);
 	return err;
@@ -320,30 +263,12 @@
 
 int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid)
 {
-	u32 out[MLX5_ST_SZ_DW(dealloc_xrcd_out)] = {};
-	u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)]   = {};
+	u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {};
 
 	MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD);
 	MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn);
 	MLX5_SET(dealloc_xrcd_in, in, uid, uid);
-	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
-}
-
-int mlx5_cmd_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id,
-			     u16 uid)
-{
-	u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)]   = {0};
-	u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {0};
-	int err;
-
-	MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER);
-	MLX5_SET(alloc_q_counter_in, in, uid, uid);
-
-	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
-	if (!err)
-		*counter_id = MLX5_GET(alloc_q_counter_out, out,
-				       counter_set_id);
-	return err;
+	return mlx5_cmd_exec_in(dev, dealloc_xrcd, in);
 }
 
 int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
@@ -369,7 +294,7 @@
 	data = MLX5_ADDR_OF(mad_ifc_in, in, mad);
 	memcpy(data, inb, MLX5_FLD_SZ_BYTES(mad_ifc_in, mad));
 
-	err = mlx5_cmd_exec(dev, in, inlen, out, outlen);
+	err = mlx5_cmd_exec_inout(dev, mad_ifc, in, out);
 	if (err)
 		goto out;
 
diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h
index 169cab4..88ea6ef 100644
--- a/drivers/infiniband/hw/mlx5/cmd.h
+++ b/drivers/infiniband/hw/mlx5/cmd.h
@@ -40,17 +40,14 @@
 int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey);
 int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey);
 int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point,
-			       void *out, int out_size);
-int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out);
-int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev,
-				void *in, int in_size);
+			       void *out);
 int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr,
 			 u64 length, u32 alignment);
-int mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length);
-void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid);
+void mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length);
+int mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid);
 void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid);
 void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid);
-void mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid);
+int mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid);
 int mlx5_cmd_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn,
 				    u16 uid);
 void mlx5_cmd_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn,
@@ -61,8 +58,6 @@
 			u32 qpn, u16 uid);
 int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid);
 int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid);
-int mlx5_cmd_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id,
-			     u16 uid);
 int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
 		     u16 opmod, u8 port);
 #endif /* MLX5_IB_CMD_H */
diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c
index 8ba439f..b9291e4 100644
--- a/drivers/infiniband/hw/mlx5/cong.c
+++ b/drivers/infiniband/hw/mlx5/cong.c
@@ -47,6 +47,7 @@
 	"rp_byte_reset",
 	"rp_threshold",
 	"rp_ai_rate",
+	"rp_max_rate",
 	"rp_hai_rate",
 	"rp_min_dec_fac",
 	"rp_min_rate",
@@ -56,6 +57,7 @@
 	"rp_rate_reduce_monitor_period",
 	"rp_initial_alpha_value",
 	"rp_gd",
+	"np_min_time_between_cnps",
 	"np_cnp_dscp",
 	"np_cnp_prio_mode",
 	"np_cnp_prio",
@@ -66,6 +68,7 @@
 #define MLX5_IB_RP_TIME_RESET_ATTR			BIT(3)
 #define MLX5_IB_RP_BYTE_RESET_ATTR			BIT(4)
 #define MLX5_IB_RP_THRESHOLD_ATTR			BIT(5)
+#define MLX5_IB_RP_MAX_RATE_ATTR			BIT(6)
 #define MLX5_IB_RP_AI_RATE_ATTR				BIT(7)
 #define MLX5_IB_RP_HAI_RATE_ATTR			BIT(8)
 #define MLX5_IB_RP_MIN_DEC_FAC_ATTR			BIT(9)
@@ -77,6 +80,7 @@
 #define MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR		BIT(15)
 #define MLX5_IB_RP_GD_ATTR				BIT(16)
 
+#define MLX5_IB_NP_MIN_TIME_BETWEEN_CNPS_ATTR		BIT(2)
 #define MLX5_IB_NP_CNP_DSCP_ATTR			BIT(3)
 #define MLX5_IB_NP_CNP_PRIO_MODE_ATTR			BIT(4)
 
@@ -111,6 +115,9 @@
 	case MLX5_IB_DBG_CC_RP_AI_RATE:
 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
 				rpg_ai_rate);
+	case MLX5_IB_DBG_CC_RP_MAX_RATE:
+		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
+				rpg_max_rate);
 	case MLX5_IB_DBG_CC_RP_HAI_RATE:
 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
 				rpg_hai_rate);
@@ -138,6 +145,9 @@
 	case MLX5_IB_DBG_CC_RP_GD:
 		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
 				rpg_gd);
+	case MLX5_IB_DBG_CC_NP_MIN_TIME_BETWEEN_CNPS:
+		return MLX5_GET(cong_control_r_roce_ecn_np, field,
+				min_time_between_cnps);
 	case MLX5_IB_DBG_CC_NP_CNP_DSCP:
 		return MLX5_GET(cong_control_r_roce_ecn_np, field,
 				cnp_dscp);
@@ -186,6 +196,11 @@
 		MLX5_SET(cong_control_r_roce_ecn_rp, field,
 			 rpg_ai_rate, var);
 		break;
+	case MLX5_IB_DBG_CC_RP_MAX_RATE:
+		*attr_mask |= MLX5_IB_RP_MAX_RATE_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_rp, field,
+			 rpg_max_rate, var);
+		break;
 	case MLX5_IB_DBG_CC_RP_HAI_RATE:
 		*attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR;
 		MLX5_SET(cong_control_r_roce_ecn_rp, field,
@@ -231,6 +246,11 @@
 		MLX5_SET(cong_control_r_roce_ecn_rp, field,
 			 rpg_gd, var);
 		break;
+	case MLX5_IB_DBG_CC_NP_MIN_TIME_BETWEEN_CNPS:
+		*attr_mask |= MLX5_IB_NP_MIN_TIME_BETWEEN_CNPS_ATTR;
+		MLX5_SET(cong_control_r_roce_ecn_np, field,
+			 min_time_between_cnps, var);
+		break;
 	case MLX5_IB_DBG_CC_NP_CNP_DSCP:
 		*attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR;
 		MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_dscp, var);
@@ -270,7 +290,7 @@
 
 	node = mlx5_ib_param_to_node(offset);
 
-	err = mlx5_cmd_query_cong_params(mdev, node, out, outlen);
+	err = mlx5_cmd_query_cong_params(mdev, node, out);
 	if (err)
 		goto free;
 
@@ -319,7 +339,7 @@
 	MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp,
 		 attr_mask);
 
-	err = mlx5_cmd_modify_cong_params(mdev, in, inlen);
+	err = mlx5_cmd_exec_in(dev->mdev, modify_cong_params, in);
 	kvfree(in);
 alloc_err:
 	mlx5_ib_put_native_port_mdev(dev, port_num + 1);
diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c
new file mode 100644
index 0000000..70c8fd6
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/counters.c
@@ -0,0 +1,710 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include "mlx5_ib.h"
+#include <linux/mlx5/eswitch.h>
+#include "counters.h"
+#include "ib_rep.h"
+#include "qp.h"
+
+struct mlx5_ib_counter {
+	const char *name;
+	size_t offset;
+};
+
+#define INIT_Q_COUNTER(_name)		\
+	{ .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
+
+static const struct mlx5_ib_counter basic_q_cnts[] = {
+	INIT_Q_COUNTER(rx_write_requests),
+	INIT_Q_COUNTER(rx_read_requests),
+	INIT_Q_COUNTER(rx_atomic_requests),
+	INIT_Q_COUNTER(out_of_buffer),
+};
+
+static const struct mlx5_ib_counter out_of_seq_q_cnts[] = {
+	INIT_Q_COUNTER(out_of_sequence),
+};
+
+static const struct mlx5_ib_counter retrans_q_cnts[] = {
+	INIT_Q_COUNTER(duplicate_request),
+	INIT_Q_COUNTER(rnr_nak_retry_err),
+	INIT_Q_COUNTER(packet_seq_err),
+	INIT_Q_COUNTER(implied_nak_seq_err),
+	INIT_Q_COUNTER(local_ack_timeout_err),
+};
+
+#define INIT_CONG_COUNTER(_name)		\
+	{ .name = #_name, .offset =	\
+		MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)}
+
+static const struct mlx5_ib_counter cong_cnts[] = {
+	INIT_CONG_COUNTER(rp_cnp_ignored),
+	INIT_CONG_COUNTER(rp_cnp_handled),
+	INIT_CONG_COUNTER(np_ecn_marked_roce_packets),
+	INIT_CONG_COUNTER(np_cnp_sent),
+};
+
+static const struct mlx5_ib_counter extended_err_cnts[] = {
+	INIT_Q_COUNTER(resp_local_length_error),
+	INIT_Q_COUNTER(resp_cqe_error),
+	INIT_Q_COUNTER(req_cqe_error),
+	INIT_Q_COUNTER(req_remote_invalid_request),
+	INIT_Q_COUNTER(req_remote_access_errors),
+	INIT_Q_COUNTER(resp_remote_access_errors),
+	INIT_Q_COUNTER(resp_cqe_flush_error),
+	INIT_Q_COUNTER(req_cqe_flush_error),
+};
+
+static const struct mlx5_ib_counter roce_accl_cnts[] = {
+	INIT_Q_COUNTER(roce_adp_retrans),
+	INIT_Q_COUNTER(roce_adp_retrans_to),
+	INIT_Q_COUNTER(roce_slow_restart),
+	INIT_Q_COUNTER(roce_slow_restart_cnps),
+	INIT_Q_COUNTER(roce_slow_restart_trans),
+};
+
+#define INIT_EXT_PPCNT_COUNTER(_name)		\
+	{ .name = #_name, .offset =	\
+	MLX5_BYTE_OFF(ppcnt_reg, \
+		      counter_set.eth_extended_cntrs_grp_data_layout._name##_high)}
+
+static const struct mlx5_ib_counter ext_ppcnt_cnts[] = {
+	INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated),
+};
+
+static int mlx5_ib_read_counters(struct ib_counters *counters,
+				 struct ib_counters_read_attr *read_attr,
+				 struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
+	struct mlx5_read_counters_attr mread_attr = {};
+	struct mlx5_ib_flow_counters_desc *desc;
+	int ret, i;
+
+	mutex_lock(&mcounters->mcntrs_mutex);
+	if (mcounters->cntrs_max_index > read_attr->ncounters) {
+		ret = -EINVAL;
+		goto err_bound;
+	}
+
+	mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64),
+				 GFP_KERNEL);
+	if (!mread_attr.out) {
+		ret = -ENOMEM;
+		goto err_bound;
+	}
+
+	mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl;
+	mread_attr.flags = read_attr->flags;
+	ret = mcounters->read_counters(counters->device, &mread_attr);
+	if (ret)
+		goto err_read;
+
+	/* do the pass over the counters data array to assign according to the
+	 * descriptions and indexing pairs
+	 */
+	desc = mcounters->counters_data;
+	for (i = 0; i < mcounters->ncounters; i++)
+		read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description];
+
+err_read:
+	kfree(mread_attr.out);
+err_bound:
+	mutex_unlock(&mcounters->mcntrs_mutex);
+	return ret;
+}
+
+static int mlx5_ib_destroy_counters(struct ib_counters *counters)
+{
+	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
+
+	mlx5_ib_counters_clear_description(counters);
+	if (mcounters->hw_cntrs_hndl)
+		mlx5_fc_destroy(to_mdev(counters->device)->mdev,
+				mcounters->hw_cntrs_hndl);
+	return 0;
+}
+
+static int mlx5_ib_create_counters(struct ib_counters *counters,
+				   struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
+
+	mutex_init(&mcounters->mcntrs_mutex);
+	return 0;
+}
+
+
+static bool is_mdev_switchdev_mode(const struct mlx5_core_dev *mdev)
+{
+	return MLX5_ESWITCH_MANAGER(mdev) &&
+	       mlx5_ib_eswitch_mode(mdev->priv.eswitch) ==
+		       MLX5_ESWITCH_OFFLOADS;
+}
+
+static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev,
+						   u8 port_num)
+{
+	return is_mdev_switchdev_mode(dev->mdev) ? &dev->port[0].cnts :
+						   &dev->port[port_num].cnts;
+}
+
+/**
+ * mlx5_ib_get_counters_id - Returns counters id to use for device+port
+ * @dev:	Pointer to mlx5 IB device
+ * @port_num:	Zero based port number
+ *
+ * mlx5_ib_get_counters_id() Returns counters set id to use for given
+ * device port combination in switchdev and non switchdev mode of the
+ * parent device.
+ */
+u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num)
+{
+	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num);
+
+	return cnts->set_id;
+}
+
+static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
+						    u8 port_num)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	const struct mlx5_ib_counters *cnts;
+	bool is_switchdev = is_mdev_switchdev_mode(dev->mdev);
+
+	if ((is_switchdev && port_num) || (!is_switchdev && !port_num))
+		return NULL;
+
+	cnts = get_counters(dev, port_num - 1);
+
+	return rdma_alloc_hw_stats_struct(cnts->names,
+					  cnts->num_q_counters +
+					  cnts->num_cong_counters +
+					  cnts->num_ext_ppcnt_counters,
+					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
+				    const struct mlx5_ib_counters *cnts,
+				    struct rdma_hw_stats *stats,
+				    u16 set_id)
+{
+	u32 out[MLX5_ST_SZ_DW(query_q_counter_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(query_q_counter_in)] = {};
+	__be32 val;
+	int ret, i;
+
+	MLX5_SET(query_q_counter_in, in, opcode, MLX5_CMD_OP_QUERY_Q_COUNTER);
+	MLX5_SET(query_q_counter_in, in, counter_set_id, set_id);
+	ret = mlx5_cmd_exec_inout(mdev, query_q_counter, in, out);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < cnts->num_q_counters; i++) {
+		val = *(__be32 *)((void *)out + cnts->offsets[i]);
+		stats->value[i] = (u64)be32_to_cpu(val);
+	}
+
+	return 0;
+}
+
+static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev,
+					    const struct mlx5_ib_counters *cnts,
+					    struct rdma_hw_stats *stats)
+{
+	int offset = cnts->num_q_counters + cnts->num_cong_counters;
+	u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
+	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
+	int ret, i;
+	void *out;
+
+	out = kvzalloc(sz, GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	MLX5_SET(ppcnt_reg, in, local_port, 1);
+	MLX5_SET(ppcnt_reg, in, grp, MLX5_ETHERNET_EXTENDED_COUNTERS_GROUP);
+	ret = mlx5_core_access_reg(dev->mdev, in, sz, out, sz, MLX5_REG_PPCNT,
+				   0, 0);
+	if (ret)
+		goto free;
+
+	for (i = 0; i < cnts->num_ext_ppcnt_counters; i++)
+		stats->value[i + offset] =
+			be64_to_cpup((__be64 *)(out +
+				    cnts->offsets[i + offset]));
+free:
+	kvfree(out);
+	return ret;
+}
+
+static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
+				struct rdma_hw_stats *stats,
+				u8 port_num, int index)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num - 1);
+	struct mlx5_core_dev *mdev;
+	int ret, num_counters;
+	u8 mdev_port_num;
+
+	if (!stats)
+		return -EINVAL;
+
+	num_counters = cnts->num_q_counters +
+		       cnts->num_cong_counters +
+		       cnts->num_ext_ppcnt_counters;
+
+	/* q_counters are per IB device, query the master mdev */
+	ret = mlx5_ib_query_q_counters(dev->mdev, cnts, stats, cnts->set_id);
+	if (ret)
+		return ret;
+
+	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
+		ret =  mlx5_ib_query_ext_ppcnt_counters(dev, cnts, stats);
+		if (ret)
+			return ret;
+	}
+
+	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
+		mdev = mlx5_ib_get_native_port_mdev(dev, port_num,
+						    &mdev_port_num);
+		if (!mdev) {
+			/* If port is not affiliated yet, it's in down state
+			 * which doesn't have any counters yet, so it would be
+			 * zero. So no need to read from the HCA.
+			 */
+			goto done;
+		}
+		ret = mlx5_lag_query_cong_counters(dev->mdev,
+						   stats->value +
+						   cnts->num_q_counters,
+						   cnts->num_cong_counters,
+						   cnts->offsets +
+						   cnts->num_q_counters);
+
+		mlx5_ib_put_native_port_mdev(dev, port_num);
+		if (ret)
+			return ret;
+	}
+
+done:
+	return num_counters;
+}
+
+static struct rdma_hw_stats *
+mlx5_ib_counter_alloc_stats(struct rdma_counter *counter)
+{
+	struct mlx5_ib_dev *dev = to_mdev(counter->device);
+	const struct mlx5_ib_counters *cnts =
+		get_counters(dev, counter->port - 1);
+
+	return rdma_alloc_hw_stats_struct(cnts->names,
+					  cnts->num_q_counters +
+					  cnts->num_cong_counters +
+					  cnts->num_ext_ppcnt_counters,
+					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
+}
+
+static int mlx5_ib_counter_update_stats(struct rdma_counter *counter)
+{
+	struct mlx5_ib_dev *dev = to_mdev(counter->device);
+	const struct mlx5_ib_counters *cnts =
+		get_counters(dev, counter->port - 1);
+
+	return mlx5_ib_query_q_counters(dev->mdev, cnts,
+					counter->stats, counter->id);
+}
+
+static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
+{
+	struct mlx5_ib_dev *dev = to_mdev(counter->device);
+	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {};
+
+	if (!counter->id)
+		return 0;
+
+	MLX5_SET(dealloc_q_counter_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
+	MLX5_SET(dealloc_q_counter_in, in, counter_set_id, counter->id);
+	return mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in);
+}
+
+static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
+				   struct ib_qp *qp)
+{
+	struct mlx5_ib_dev *dev = to_mdev(qp->device);
+	int err;
+
+	if (!counter->id) {
+		u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {};
+		u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {};
+
+		MLX5_SET(alloc_q_counter_in, in, opcode,
+			 MLX5_CMD_OP_ALLOC_Q_COUNTER);
+		MLX5_SET(alloc_q_counter_in, in, uid, MLX5_SHARED_RESOURCE_UID);
+		err = mlx5_cmd_exec_inout(dev->mdev, alloc_q_counter, in, out);
+		if (err)
+			return err;
+		counter->id =
+			MLX5_GET(alloc_q_counter_out, out, counter_set_id);
+	}
+
+	err = mlx5_ib_qp_set_counter(qp, counter);
+	if (err)
+		goto fail_set_counter;
+
+	return 0;
+
+fail_set_counter:
+	mlx5_ib_counter_dealloc(counter);
+	counter->id = 0;
+
+	return err;
+}
+
+static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp)
+{
+	return mlx5_ib_qp_set_counter(qp, NULL);
+}
+
+
+static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
+				  const char **names,
+				  size_t *offsets)
+{
+	int i;
+	int j = 0;
+
+	for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) {
+		names[j] = basic_q_cnts[i].name;
+		offsets[j] = basic_q_cnts[i].offset;
+	}
+
+	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) {
+		for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) {
+			names[j] = out_of_seq_q_cnts[i].name;
+			offsets[j] = out_of_seq_q_cnts[i].offset;
+		}
+	}
+
+	if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
+		for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) {
+			names[j] = retrans_q_cnts[i].name;
+			offsets[j] = retrans_q_cnts[i].offset;
+		}
+	}
+
+	if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) {
+		for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) {
+			names[j] = extended_err_cnts[i].name;
+			offsets[j] = extended_err_cnts[i].offset;
+		}
+	}
+
+	if (MLX5_CAP_GEN(dev->mdev, roce_accl)) {
+		for (i = 0; i < ARRAY_SIZE(roce_accl_cnts); i++, j++) {
+			names[j] = roce_accl_cnts[i].name;
+			offsets[j] = roce_accl_cnts[i].offset;
+		}
+	}
+
+	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
+		for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) {
+			names[j] = cong_cnts[i].name;
+			offsets[j] = cong_cnts[i].offset;
+		}
+	}
+
+	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
+		for (i = 0; i < ARRAY_SIZE(ext_ppcnt_cnts); i++, j++) {
+			names[j] = ext_ppcnt_cnts[i].name;
+			offsets[j] = ext_ppcnt_cnts[i].offset;
+		}
+	}
+}
+
+
+static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
+				    struct mlx5_ib_counters *cnts)
+{
+	u32 num_counters;
+
+	num_counters = ARRAY_SIZE(basic_q_cnts);
+
+	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt))
+		num_counters += ARRAY_SIZE(out_of_seq_q_cnts);
+
+	if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
+		num_counters += ARRAY_SIZE(retrans_q_cnts);
+
+	if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters))
+		num_counters += ARRAY_SIZE(extended_err_cnts);
+
+	if (MLX5_CAP_GEN(dev->mdev, roce_accl))
+		num_counters += ARRAY_SIZE(roce_accl_cnts);
+
+	cnts->num_q_counters = num_counters;
+
+	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
+		cnts->num_cong_counters = ARRAY_SIZE(cong_cnts);
+		num_counters += ARRAY_SIZE(cong_cnts);
+	}
+	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
+		cnts->num_ext_ppcnt_counters = ARRAY_SIZE(ext_ppcnt_cnts);
+		num_counters += ARRAY_SIZE(ext_ppcnt_cnts);
+	}
+	cnts->names = kcalloc(num_counters, sizeof(*cnts->names), GFP_KERNEL);
+	if (!cnts->names)
+		return -ENOMEM;
+
+	cnts->offsets = kcalloc(num_counters,
+				sizeof(*cnts->offsets), GFP_KERNEL);
+	if (!cnts->offsets)
+		goto err_names;
+
+	return 0;
+
+err_names:
+	kfree(cnts->names);
+	cnts->names = NULL;
+	return -ENOMEM;
+}
+
+static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_q_counter_in)] = {};
+	int num_cnt_ports;
+	int i;
+
+	num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
+
+	MLX5_SET(dealloc_q_counter_in, in, opcode,
+		 MLX5_CMD_OP_DEALLOC_Q_COUNTER);
+
+	for (i = 0; i < num_cnt_ports; i++) {
+		if (dev->port[i].cnts.set_id) {
+			MLX5_SET(dealloc_q_counter_in, in, counter_set_id,
+				 dev->port[i].cnts.set_id);
+			mlx5_cmd_exec_in(dev->mdev, dealloc_q_counter, in);
+		}
+		kfree(dev->port[i].cnts.names);
+		kfree(dev->port[i].cnts.offsets);
+	}
+}
+
+static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
+{
+	u32 out[MLX5_ST_SZ_DW(alloc_q_counter_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(alloc_q_counter_in)] = {};
+	int num_cnt_ports;
+	int err = 0;
+	int i;
+	bool is_shared;
+
+	MLX5_SET(alloc_q_counter_in, in, opcode, MLX5_CMD_OP_ALLOC_Q_COUNTER);
+	is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0;
+	num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
+
+	for (i = 0; i < num_cnt_ports; i++) {
+		err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts);
+		if (err)
+			goto err_alloc;
+
+		mlx5_ib_fill_counters(dev, dev->port[i].cnts.names,
+				      dev->port[i].cnts.offsets);
+
+		MLX5_SET(alloc_q_counter_in, in, uid,
+			 is_shared ? MLX5_SHARED_RESOURCE_UID : 0);
+
+		err = mlx5_cmd_exec_inout(dev->mdev, alloc_q_counter, in, out);
+		if (err) {
+			mlx5_ib_warn(dev,
+				     "couldn't allocate queue counter for port %d, err %d\n",
+				     i + 1, err);
+			goto err_alloc;
+		}
+
+		dev->port[i].cnts.set_id =
+			MLX5_GET(alloc_q_counter_out, out, counter_set_id);
+	}
+	return 0;
+
+err_alloc:
+	mlx5_ib_dealloc_counters(dev);
+	return err;
+}
+
+static int read_flow_counters(struct ib_device *ibdev,
+			      struct mlx5_read_counters_attr *read_attr)
+{
+	struct mlx5_fc *fc = read_attr->hw_cntrs_hndl;
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+
+	return mlx5_fc_query(dev->mdev, fc,
+			     &read_attr->out[IB_COUNTER_PACKETS],
+			     &read_attr->out[IB_COUNTER_BYTES]);
+}
+
+/* flow counters currently expose two counters: packets and bytes */
+#define FLOW_COUNTERS_NUM 2
+static int counters_set_description(
+	struct ib_counters *counters, enum mlx5_ib_counters_type counters_type,
+	struct mlx5_ib_flow_counters_desc *desc_data, u32 ncounters)
+{
+	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
+	u32 cntrs_max_index = 0;
+	int i;
+
+	if (counters_type != MLX5_IB_COUNTERS_FLOW)
+		return -EINVAL;
+
+	/* init the fields for the object */
+	mcounters->type = counters_type;
+	mcounters->read_counters = read_flow_counters;
+	mcounters->counters_num = FLOW_COUNTERS_NUM;
+	mcounters->ncounters = ncounters;
+	/* each counter entry has both a description and an index pair */
+	for (i = 0; i < ncounters; i++) {
+		if (desc_data[i].description > IB_COUNTER_BYTES)
+			return -EINVAL;
+
+		if (cntrs_max_index <= desc_data[i].index)
+			cntrs_max_index = desc_data[i].index + 1;
+	}
+
+	mutex_lock(&mcounters->mcntrs_mutex);
+	mcounters->counters_data = desc_data;
+	mcounters->cntrs_max_index = cntrs_max_index;
+	mutex_unlock(&mcounters->mcntrs_mutex);
+
+	return 0;
+}
+
+#define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2))
+int mlx5_ib_flow_counters_set_data(struct ib_counters *ibcounters,
+				   struct mlx5_ib_create_flow *ucmd)
+{
+	struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters);
+	struct mlx5_ib_flow_counters_data *cntrs_data = NULL;
+	struct mlx5_ib_flow_counters_desc *desc_data = NULL;
+	bool hw_hndl = false;
+	int ret = 0;
+
+	if (ucmd && ucmd->ncounters_data != 0) {
+		cntrs_data = ucmd->data;
+		if (cntrs_data->ncounters > MAX_COUNTERS_NUM)
+			return -EINVAL;
+
+		desc_data = kcalloc(cntrs_data->ncounters,
+				    sizeof(*desc_data),
+				    GFP_KERNEL);
+		if (!desc_data)
+			return  -ENOMEM;
+
+		if (copy_from_user(desc_data,
+				   u64_to_user_ptr(cntrs_data->counters_data),
+				   sizeof(*desc_data) * cntrs_data->ncounters)) {
+			ret = -EFAULT;
+			goto free;
+		}
+	}
+
+	if (!mcounters->hw_cntrs_hndl) {
+		mcounters->hw_cntrs_hndl = mlx5_fc_create(
+			to_mdev(ibcounters->device)->mdev, false);
+		if (IS_ERR(mcounters->hw_cntrs_hndl)) {
+			ret = PTR_ERR(mcounters->hw_cntrs_hndl);
+			goto free;
+		}
+		hw_hndl = true;
+	}
+
+	if (desc_data) {
+		/* counters already bound to at least one flow */
+		if (mcounters->cntrs_max_index) {
+			ret = -EINVAL;
+			goto free_hndl;
+		}
+
+		ret = counters_set_description(ibcounters,
+					       MLX5_IB_COUNTERS_FLOW,
+					       desc_data,
+					       cntrs_data->ncounters);
+		if (ret)
+			goto free_hndl;
+
+	} else if (!mcounters->cntrs_max_index) {
+		/* counters not bound yet, must have udata passed */
+		ret = -EINVAL;
+		goto free_hndl;
+	}
+
+	return 0;
+
+free_hndl:
+	if (hw_hndl) {
+		mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev,
+				mcounters->hw_cntrs_hndl);
+		mcounters->hw_cntrs_hndl = NULL;
+	}
+free:
+	kfree(desc_data);
+	return ret;
+}
+
+void mlx5_ib_counters_clear_description(struct ib_counters *counters)
+{
+	struct mlx5_ib_mcounters *mcounters;
+
+	if (!counters || atomic_read(&counters->usecnt) != 1)
+		return;
+
+	mcounters = to_mcounters(counters);
+
+	mutex_lock(&mcounters->mcntrs_mutex);
+	kfree(mcounters->counters_data);
+	mcounters->counters_data = NULL;
+	mcounters->cntrs_max_index = 0;
+	mutex_unlock(&mcounters->mcntrs_mutex);
+}
+
+static const struct ib_device_ops hw_stats_ops = {
+	.alloc_hw_stats = mlx5_ib_alloc_hw_stats,
+	.get_hw_stats = mlx5_ib_get_hw_stats,
+	.counter_bind_qp = mlx5_ib_counter_bind_qp,
+	.counter_unbind_qp = mlx5_ib_counter_unbind_qp,
+	.counter_dealloc = mlx5_ib_counter_dealloc,
+	.counter_alloc_stats = mlx5_ib_counter_alloc_stats,
+	.counter_update_stats = mlx5_ib_counter_update_stats,
+};
+
+static const struct ib_device_ops counters_ops = {
+	.create_counters = mlx5_ib_create_counters,
+	.destroy_counters = mlx5_ib_destroy_counters,
+	.read_counters = mlx5_ib_read_counters,
+
+	INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs),
+};
+
+int mlx5_ib_counters_init(struct mlx5_ib_dev *dev)
+{
+	ib_set_device_ops(&dev->ib_dev, &counters_ops);
+
+	if (!MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
+		return 0;
+
+	ib_set_device_ops(&dev->ib_dev, &hw_stats_ops);
+	return mlx5_ib_alloc_counters(dev);
+}
+
+void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev)
+{
+	if (!MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
+		return;
+
+	mlx5_ib_dealloc_counters(dev);
+}
diff --git a/drivers/infiniband/hw/mlx5/counters.h b/drivers/infiniband/hw/mlx5/counters.h
new file mode 100644
index 0000000..1aa30c2
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/counters.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#ifndef _MLX5_IB_COUNTERS_H
+#define _MLX5_IB_COUNTERS_H
+
+#include "mlx5_ib.h"
+
+int mlx5_ib_counters_init(struct mlx5_ib_dev *dev);
+void mlx5_ib_counters_cleanup(struct mlx5_ib_dev *dev);
+void mlx5_ib_counters_clear_description(struct ib_counters *counters);
+int mlx5_ib_flow_counters_set_data(struct ib_counters *ibcounters,
+				   struct mlx5_ib_create_flow *ucmd);
+u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num);
+#endif /* _MLX5_IB_COUNTERS_H */
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
index 23ce012..74644b6 100644
--- a/drivers/infiniband/hw/mlx5/cq.c
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -36,6 +36,7 @@
 #include <rdma/ib_cache.h>
 #include "mlx5_ib.h"
 #include "srq.h"
+#include "qp.h"
 
 static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe)
 {
@@ -120,13 +121,13 @@
 	switch (be32_to_cpu(cqe->sop_drop_qpn) >> 24) {
 	case MLX5_OPCODE_RDMA_WRITE_IMM:
 		wc->wc_flags |= IB_WC_WITH_IMM;
-		/* fall through */
+		fallthrough;
 	case MLX5_OPCODE_RDMA_WRITE:
 		wc->opcode    = IB_WC_RDMA_WRITE;
 		break;
 	case MLX5_OPCODE_SEND_IMM:
 		wc->wc_flags |= IB_WC_WITH_IMM;
-		/* fall through */
+		fallthrough;
 	case MLX5_OPCODE_SEND:
 	case MLX5_OPCODE_SEND_INVAL:
 		wc->opcode    = IB_WC_SEND;
@@ -202,7 +203,7 @@
 	case MLX5_CQE_RESP_WR_IMM:
 		wc->opcode	= IB_WC_RECV_RDMA_WITH_IMM;
 		wc->wc_flags	= IB_WC_WITH_IMM;
-		wc->ex.imm_data = cqe->imm_inval_pkey;
+		wc->ex.imm_data = cqe->immediate;
 		break;
 	case MLX5_CQE_RESP_SEND:
 		wc->opcode   = IB_WC_RECV;
@@ -214,12 +215,12 @@
 	case MLX5_CQE_RESP_SEND_IMM:
 		wc->opcode	= IB_WC_RECV;
 		wc->wc_flags	= IB_WC_WITH_IMM;
-		wc->ex.imm_data = cqe->imm_inval_pkey;
+		wc->ex.imm_data = cqe->immediate;
 		break;
 	case MLX5_CQE_RESP_SEND_INV:
 		wc->opcode	= IB_WC_RECV;
 		wc->wc_flags	= IB_WC_WITH_INVALIDATE;
-		wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey);
+		wc->ex.invalidate_rkey = be32_to_cpu(cqe->inval_rkey);
 		break;
 	}
 	wc->src_qp	   = be32_to_cpu(cqe->flags_rqpn) & 0xffffff;
@@ -227,7 +228,7 @@
 	g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3;
 	wc->wc_flags |= g ? IB_WC_GRH : 0;
 	if (unlikely(is_qp1(qp->ibqp.qp_type))) {
-		u16 pkey = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff;
+		u16 pkey = be32_to_cpu(cqe->pkey) & 0xffff;
 
 		ib_find_cached_pkey(&dev->ib_dev, qp->port, pkey,
 				    &wc->pkey_index);
@@ -254,7 +255,7 @@
 
 	switch (roce_packet_type) {
 	case MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH:
-		wc->network_hdr_type = RDMA_NETWORK_IB;
+		wc->network_hdr_type = RDMA_NETWORK_ROCE_V1;
 		break;
 	case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6:
 		wc->network_hdr_type = RDMA_NETWORK_IPV6;
@@ -446,9 +447,6 @@
 	struct mlx5_cqe64 *cqe64;
 	struct mlx5_core_qp *mqp;
 	struct mlx5_ib_wq *wq;
-	struct mlx5_sig_err_cqe *sig_err_cqe;
-	struct mlx5_core_mkey *mmkey;
-	struct mlx5_ib_mr *mr;
 	uint8_t opcode;
 	uint32_t qpn;
 	u16 wqe_ctr;
@@ -488,7 +486,7 @@
 		 * because CQs will be locked while QPs are removed
 		 * from the table.
 		 */
-		mqp = __mlx5_qp_lookup(dev->mdev, qpn);
+		mqp = radix_tree_lookup(&dev->qp_table.tree, qpn);
 		*cur_qp = to_mibqp(mqp);
 	}
 
@@ -543,27 +541,29 @@
 			}
 		}
 		break;
-	case MLX5_CQE_SIG_ERR:
-		sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64;
+	case MLX5_CQE_SIG_ERR: {
+		struct mlx5_sig_err_cqe *sig_err_cqe =
+			(struct mlx5_sig_err_cqe *)cqe64;
+		struct mlx5_core_sig_ctx *sig;
 
-		xa_lock(&dev->mdev->priv.mkey_table);
-		mmkey = xa_load(&dev->mdev->priv.mkey_table,
+		xa_lock(&dev->sig_mrs);
+		sig = xa_load(&dev->sig_mrs,
 				mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey)));
-		mr = to_mibmr(mmkey);
-		get_sig_err_item(sig_err_cqe, &mr->sig->err_item);
-		mr->sig->sig_err_exists = true;
-		mr->sig->sigerr_count++;
+		get_sig_err_item(sig_err_cqe, &sig->err_item);
+		sig->sig_err_exists = true;
+		sig->sigerr_count++;
 
 		mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n",
-			     cq->mcq.cqn, mr->sig->err_item.key,
-			     mr->sig->err_item.err_type,
-			     mr->sig->err_item.sig_err_offset,
-			     mr->sig->err_item.expected,
-			     mr->sig->err_item.actual);
+			     cq->mcq.cqn, sig->err_item.key,
+			     sig->err_item.err_type,
+			     sig->err_item.sig_err_offset,
+			     sig->err_item.expected,
+			     sig->err_item.actual);
 
-		xa_unlock(&dev->mdev->priv.mkey_table);
+		xa_unlock(&dev->sig_mrs);
 		goto repoll;
 	}
+	}
 
 	return 0;
 }
@@ -717,24 +717,26 @@
 	struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
 		udata, struct mlx5_ib_ucontext, ibucontext);
 
-	ucmdlen = udata->inlen < sizeof(ucmd) ?
-		  (sizeof(ucmd) - sizeof(ucmd.flags)) : sizeof(ucmd);
+	ucmdlen = min(udata->inlen, sizeof(ucmd));
+	if (ucmdlen < offsetof(struct mlx5_ib_create_cq, flags))
+		return -EINVAL;
 
 	if (ib_copy_from_udata(&ucmd, udata, ucmdlen))
 		return -EFAULT;
 
-	if (ucmdlen == sizeof(ucmd) &&
-	    (ucmd.flags & ~(MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD)))
+	if ((ucmd.flags & ~(MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD |
+			    MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX)))
 		return -EINVAL;
 
-	if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128)
+	if ((ucmd.cqe_size != 64 && ucmd.cqe_size != 128) ||
+	    ucmd.reserved0 || ucmd.reserved1)
 		return -EINVAL;
 
 	*cqe_size = ucmd.cqe_size;
 
 	cq->buf.umem =
-		ib_umem_get(udata, ucmd.buf_addr, entries * ucmd.cqe_size,
-			    IB_ACCESS_LOCAL_WRITE, 1);
+		ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
+			    entries * ucmd.cqe_size, IB_ACCESS_LOCAL_WRITE);
 	if (IS_ERR(cq->buf.umem)) {
 		err = PTR_ERR(cq->buf.umem);
 		return err;
@@ -764,7 +766,14 @@
 	MLX5_SET(cqc, cqc, log_page_size,
 		 page_shift - MLX5_ADAPTER_PAGE_SHIFT);
 
-	*index = context->bfregi.sys_pages[0];
+	if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) {
+		*index = ucmd.uar_page_index;
+	} else if (context->bfregi.lib_uar_dyn) {
+		err = -EINVAL;
+		goto err_cqb;
+	} else {
+		*index = context->bfregi.sys_pages[0];
+	}
 
 	if (ucmd.cqe_comp_en == 1) {
 		int mini_cqe_format;
@@ -916,12 +925,11 @@
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
 	struct mlx5_ib_cq *cq = to_mcq(ibcq);
 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
-	int uninitialized_var(index);
-	int uninitialized_var(inlen);
+	int index;
+	int inlen;
 	u32 *cqb = NULL;
 	void *cqc;
 	int cqe_size;
-	unsigned int irqn;
 	int eqn;
 	int err;
 
@@ -960,7 +968,7 @@
 		INIT_WORK(&cq->notify_work, notify_soft_wc_handler);
 	}
 
-	err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn);
+	err = mlx5_vector2eqn(dev->mdev, vector, &eqn);
 	if (err)
 		goto err_cqb;
 
@@ -983,7 +991,6 @@
 		goto err_cqb;
 
 	mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn);
-	cq->mcq.irqn = irqn;
 	if (udata)
 		cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp;
 	else
@@ -1014,16 +1021,21 @@
 	return err;
 }
 
-void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(cq->device);
 	struct mlx5_ib_cq *mcq = to_mcq(cq);
+	int ret;
 
-	mlx5_core_destroy_cq(dev->mdev, &mcq->mcq);
+	ret = mlx5_core_destroy_cq(dev->mdev, &mcq->mcq);
+	if (ret)
+		return ret;
+
 	if (udata)
 		destroy_cq_user(mcq, udata);
 	else
 		destroy_cq_kernel(dev, mcq);
+	return 0;
 }
 
 static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn)
@@ -1132,9 +1144,9 @@
 	if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1)
 		return -EINVAL;
 
-	umem = ib_umem_get(udata, ucmd.buf_addr,
+	umem = ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
 			   (size_t)ucmd.cqe_size * entries,
-			   IB_ACCESS_LOCAL_WRITE, 1);
+			   IB_ACCESS_LOCAL_WRITE);
 	if (IS_ERR(umem)) {
 		err = PTR_ERR(umem);
 		return err;
@@ -1237,7 +1249,7 @@
 	__be64 *pas;
 	int page_shift;
 	int inlen;
-	int uninitialized_var(cqe_size);
+	int cqe_size;
 	unsigned long flags;
 
 	if (!MLX5_CAP_GEN(dev->mdev, cq_resize)) {
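The cq.c hunks above track three interface changes in this kernel: ib_umem_get() now takes the ib_device and no longer has udata or dmasync parameters, mlx5_vector2eqn() no longer reports the IRQ number, and mlx5_ib_destroy_cq()/mlx5_core_destroy_cq() can now fail, with the error propagated to the caller. A minimal sketch of the new umem call pattern, assuming only the core RDMA headers; the helper name my_pin_cq_buf is illustrative and not part of the patch:

#include <linux/err.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>

static int my_pin_cq_buf(struct ib_device *ibdev, unsigned long addr,
			 size_t len, struct ib_umem **umem)
{
	/*
	 * v5.10 form: the first argument is the ib_device itself; the old
	 * udata and dmasync parameters are gone.
	 */
	*umem = ib_umem_get(ibdev, addr, len, IB_ACCESS_LOCAL_WRITE);
	if (IS_ERR(*umem))
		return PTR_ERR(*umem);
	return 0;
}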
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 664e0f3..343e670 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -14,6 +14,8 @@
 #include <linux/mlx5/driver.h>
 #include <linux/mlx5/fs.h>
 #include "mlx5_ib.h"
+#include "devx.h"
+#include "qp.h"
 #include <linux/xarray.h>
 
 #define UVERBS_MODULE_NAME mlx5_ib
@@ -30,7 +32,7 @@
 struct devx_async_data {
 	struct mlx5_ib_dev *mdev;
 	struct list_head list;
-	struct ib_uobject *fd_uobj;
+	struct devx_async_cmd_event_file *ev_file;
 	struct mlx5_async_work cb_work;
 	u16 cmd_out_len;
 	/* must be last field in this structure */
@@ -72,7 +74,6 @@
 	struct rcu_head	rcu;
 	u64 cookie;
 	struct devx_async_event_file *ev_file;
-	struct file *filp; /* Upon hot unplug we need a direct access to */
 	struct eventfd_ctx *eventfd;
 };
 
@@ -89,21 +90,6 @@
 	u8 is_destroyed:1;
 };
 
-#define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in)
-struct devx_obj {
-	struct mlx5_ib_dev	*ib_dev;
-	u64			obj_id;
-	u32			dinlen; /* destroy inbox length */
-	u32			dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW];
-	u32			flags;
-	union {
-		struct mlx5_ib_devx_mr	devx_mr;
-		struct mlx5_core_dct	core_dct;
-		struct mlx5_core_cq	core_cq;
-	};
-	struct list_head event_sub; /* holds devx_event_subscription entries */
-};
-
 struct devx_umem {
 	struct mlx5_core_dev		*mdev;
 	struct ib_umem			*umem;
@@ -170,43 +156,6 @@
 	mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
 }
 
-bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type)
-{
-	struct devx_obj *devx_obj = obj;
-	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode);
-
-	switch (opcode) {
-	case MLX5_CMD_OP_DESTROY_TIR:
-		*dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR;
-		*dest_id = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox,
-				    obj_id);
-		return true;
-
-	case MLX5_CMD_OP_DESTROY_FLOW_TABLE:
-		*dest_type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
-		*dest_id = MLX5_GET(destroy_flow_table_in, devx_obj->dinbox,
-				    table_id);
-		return true;
-	default:
-		return false;
-	}
-}
-
-bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id)
-{
-	struct devx_obj *devx_obj = obj;
-	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode);
-
-	if (opcode == MLX5_CMD_OP_DEALLOC_FLOW_COUNTER) {
-		*counter_id = MLX5_GET(dealloc_flow_counter_in,
-				       devx_obj->dinbox,
-				       flow_counter_id);
-		return true;
-	}
-
-	return false;
-}
-
 static bool is_legacy_unaffiliated_event_num(u16 event_num)
 {
 	switch (event_num) {
@@ -610,10 +559,9 @@
 	case UVERBS_OBJECT_QP:
 	{
 		struct mlx5_ib_qp *qp = to_mqp(uobj->object);
-		enum ib_qp_type	qp_type = qp->ibqp.qp_type;
 
-		if (qp_type == IB_QPT_RAW_PACKET ||
-		    (qp->flags & MLX5_IB_QP_UNDERLAY)) {
+		if (qp->type == IB_QPT_RAW_PACKET ||
+		    (qp->flags & IB_QP_CREATE_SOURCE_QPN)) {
 			struct mlx5_ib_raw_packet_qp *raw_packet_qp =
 							 &qp->raw_packet_qp;
 			struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
@@ -629,10 +577,9 @@
 					       sq->tisn) == obj_id);
 		}
 
-		if (qp_type == MLX5_IB_QPT_DCT)
+		if (qp->type == MLX5_IB_QPT_DCT)
 			return get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT,
 					      qp->dct.mdct.mqp.qpn) == obj_id;
-
 		return get_enc_obj_id(MLX5_CMD_OP_CREATE_QP,
 				      qp->ibqp.qp_num) == obj_id;
 	}
@@ -957,7 +904,6 @@
 	struct mlx5_ib_dev *dev;
 	int user_vector;
 	int dev_eqn;
-	unsigned int irqn;
 	int err;
 
 	if (uverbs_copy_from(&user_vector, attrs,
@@ -969,7 +915,7 @@
 		return PTR_ERR(c);
 	dev = to_mdev(c->ibucontext.device);
 
-	err = mlx5_vector2eqn(dev->mdev, user_vector, &dev_eqn, &irqn);
+	err = mlx5_vector2eqn(dev->mdev, user_vector, &dev_eqn);
 	if (err < 0)
 		return err;
 
@@ -1272,8 +1218,8 @@
 	mkey->pd = MLX5_GET(mkc, mkc, pd);
 	devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size);
 
-	return xa_err(xa_store(&dev->mdev->priv.mkey_table,
-			       mlx5_base_mkey(mkey->key), mkey, GFP_KERNEL));
+	return xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mkey->key), mkey,
+			       GFP_KERNEL));
 }
 
 static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
@@ -1352,13 +1298,13 @@
 		 * the mmkey, we must wait for that to stop before freeing the
 		 * mkey, as another allocation could get the same mkey #.
 		 */
-		xa_erase(&obj->ib_dev->mdev->priv.mkey_table,
+		xa_erase(&obj->ib_dev->odp_mkeys,
 			 mlx5_base_mkey(obj->devx_mr.mmkey.key));
-		synchronize_srcu(&dev->mr_srcu);
+		synchronize_srcu(&dev->odp_srcu);
 	}
 
 	if (obj->flags & DEVX_OBJ_FLAGS_DCT)
-		ret = mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct);
+		ret = mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct);
 	else if (obj->flags & DEVX_OBJ_FLAGS_CQ)
 		ret = mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq);
 	else
@@ -1452,9 +1398,8 @@
 
 	if (opcode == MLX5_CMD_OP_CREATE_DCT) {
 		obj->flags |= DEVX_OBJ_FLAGS_DCT;
-		err = mlx5_core_create_dct(dev->mdev, &obj->core_dct,
-					   cmd_in, cmd_in_len,
-					   cmd_out, cmd_out_len);
+		err = mlx5_core_create_dct(dev, &obj->core_dct, cmd_in,
+					   cmd_in_len, cmd_out, cmd_out_len);
 	} else if (opcode == MLX5_CMD_OP_CREATE_CQ) {
 		obj->flags |= DEVX_OBJ_FLAGS_CQ;
 		obj->core_cq.comp = devx_cq_comp;
@@ -1470,6 +1415,13 @@
 	if (err)
 		goto obj_free;
 
+	if (opcode == MLX5_CMD_OP_ALLOC_FLOW_COUNTER) {
+		u8 bulk = MLX5_GET(alloc_flow_counter_in,
+				   cmd_in,
+				   flow_counter_bulk);
+		obj->flow_counter_bulk_size = 128UL * bulk;
+	}
+
 	uobj->object = obj;
 	INIT_LIST_HEAD(&obj->event_sub);
 	obj->ib_dev = dev;
@@ -1494,7 +1446,7 @@
 
 obj_destroy:
 	if (obj->flags & DEVX_OBJ_FLAGS_DCT)
-		mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct);
+		mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct);
 	else if (obj->flags & DEVX_OBJ_FLAGS_CQ)
 		mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq);
 	else
@@ -1668,21 +1620,20 @@
 {
 	struct devx_async_data *async_data =
 		container_of(context, struct devx_async_data, cb_work);
-	struct ib_uobject *fd_uobj = async_data->fd_uobj;
-	struct devx_async_cmd_event_file *ev_file;
-	struct devx_async_event_queue *ev_queue;
+	struct devx_async_cmd_event_file *ev_file = async_data->ev_file;
+	struct devx_async_event_queue *ev_queue = &ev_file->ev_queue;
 	unsigned long flags;
 
-	ev_file = container_of(fd_uobj, struct devx_async_cmd_event_file,
-			       uobj);
-	ev_queue = &ev_file->ev_queue;
-
+	/*
+	 * Note that if the struct devx_async_cmd_event_file uobj begins to be
+	 * destroyed it will block at mlx5_cmd_cleanup_async_ctx() until this
+	 * routine returns, ensuring that it always remains valid here.
+	 */
 	spin_lock_irqsave(&ev_queue->lock, flags);
 	list_add_tail(&async_data->list, &ev_queue->event_list);
 	spin_unlock_irqrestore(&ev_queue->lock, flags);
 
 	wake_up_interruptible(&ev_queue->poll_wait);
-	fput(fd_uobj->object);
 }
 
 #define MAX_ASYNC_BYTES_IN_USE (1024 * 1024) /* 1MB */
@@ -1751,9 +1702,8 @@
 
 	async_data->cmd_out_len = cmd_out_len;
 	async_data->mdev = mdev;
-	async_data->fd_uobj = fd_uobj;
+	async_data->ev_file = ev_file;
 
-	get_file(fd_uobj->object);
 	MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid);
 	err = mlx5_cmd_exec_cb(&ev_file->async_ctx, cmd_in,
 		    uverbs_attr_get_len(attrs,
@@ -1763,12 +1713,10 @@
 		    devx_query_callback, &async_data->cb_work);
 
 	if (err)
-		goto cb_err;
+		goto free_async;
 
 	return 0;
 
-cb_err:
-	fput(fd_uobj->object);
 free_async:
 	kvfree(async_data);
 sub_bytes:
@@ -2028,6 +1976,7 @@
 		}
 
 		list_add_tail(&event_sub->event_list, &sub_list);
+		uverbs_uobject_get(&ev_file->uobj);
 		if (use_eventfd) {
 			event_sub->eventfd =
 				eventfd_ctx_fdget(redirect_fd);
@@ -2041,7 +1990,6 @@
 
 		event_sub->cookie = cookie;
 		event_sub->ev_file = ev_file;
-		event_sub->filp = fd_uobj->object;
 		/* May be needed upon cleanup the devx object/subscription */
 		event_sub->xa_key_level1 = key_level1;
 		event_sub->xa_key_level2 = obj_id;
@@ -2095,7 +2043,7 @@
 
 		if (event_sub->eventfd)
 			eventfd_ctx_put(event_sub->eventfd);
-
+		uverbs_uobject_put(&event_sub->ev_file->uobj);
 		kfree(event_sub);
 	}
 
@@ -2130,7 +2078,7 @@
 	if (err)
 		return err;
 
-	obj->umem = ib_umem_get(&attrs->driver_udata, addr, size, access, 0);
+	obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access);
 	if (IS_ERR(obj->umem))
 		return PTR_ERR(obj->umem);
 
@@ -2218,14 +2166,12 @@
 	obj->mdev = dev->mdev;
 	uobj->object = obj;
 	devx_obj_build_destroy_cmd(cmd.in, cmd.out, obj->dinbox, &obj->dinlen, &obj_id);
-	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID, &obj_id, sizeof(obj_id));
-	if (err)
-		goto err_umem_destroy;
+	uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE);
 
-	return 0;
+	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_UMEM_REG_OUT_ID, &obj_id,
+			     sizeof(obj_id));
+	return err;
 
-err_umem_destroy:
-	mlx5_cmd_exec(obj->mdev, obj->dinbox, obj->dinlen, cmd.out, sizeof(cmd.out));
 err_umem_release:
 	ib_umem_release(obj->umem);
 err_obj_free:
@@ -2320,7 +2266,8 @@
 
 	if (ev_file->omit_data) {
 		spin_lock_irqsave(&ev_file->lock, flags);
-		if (!list_empty(&event_sub->event_list)) {
+		if (!list_empty(&event_sub->event_list) ||
+		    ev_file->is_destroyed) {
 			spin_unlock_irqrestore(&ev_file->lock, flags);
 			return 0;
 		}
@@ -2344,7 +2291,10 @@
 	memcpy(event_data->hdr.out_data, data, sizeof(struct mlx5_eqe));
 
 	spin_lock_irqsave(&ev_file->lock, flags);
-	list_add_tail(&event_data->list, &ev_file->event_list);
+	if (!ev_file->is_destroyed)
+		list_add_tail(&event_data->list, &ev_file->event_list);
+	else
+		kfree(event_data);
 	spin_unlock_irqrestore(&ev_file->lock, flags);
 	wake_up_interruptible(&ev_file->poll_wait);
 
@@ -2357,17 +2307,10 @@
 	struct devx_event_subscription *item;
 
 	list_for_each_entry_rcu(item, fd_list, xa_list) {
-		if (!get_file_rcu(item->filp))
-			continue;
-
-		if (item->eventfd) {
+		if (item->eventfd)
 			eventfd_signal(item->eventfd, 1);
-			fput(item->filp);
-			continue;
-		}
-
-		deliver_event(item, data);
-		fput(item->filp);
+		else
+			deliver_event(item, data);
 	}
 }
 
@@ -2420,17 +2363,24 @@
 	return NOTIFY_OK;
 }
 
-void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev)
+int mlx5_ib_devx_init(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_devx_event_table *table = &dev->devx_event_table;
+	int uid;
 
-	xa_init(&table->event_xa);
-	mutex_init(&table->event_xa_lock);
-	MLX5_NB_INIT(&table->devx_nb, devx_event_notifier, NOTIFY_ANY);
-	mlx5_eq_notifier_register(dev->mdev, &table->devx_nb);
+	uid = mlx5_ib_devx_create(dev, false);
+	if (uid > 0) {
+		dev->devx_whitelist_uid = uid;
+		xa_init(&table->event_xa);
+		mutex_init(&table->event_xa_lock);
+		MLX5_NB_INIT(&table->devx_nb, devx_event_notifier, NOTIFY_ANY);
+		mlx5_eq_notifier_register(dev->mdev, &table->devx_nb);
+	}
+
+	return 0;
 }
 
-void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev)
+void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_devx_event_table *table = &dev->devx_event_table;
 	struct devx_event_subscription *sub, *tmp;
@@ -2438,17 +2388,21 @@
 	void *entry;
 	unsigned long id;
 
-	mlx5_eq_notifier_unregister(dev->mdev, &table->devx_nb);
-	mutex_lock(&dev->devx_event_table.event_xa_lock);
-	xa_for_each(&table->event_xa, id, entry) {
-		event = entry;
-		list_for_each_entry_safe(sub, tmp, &event->unaffiliated_list,
-					 xa_list)
-			devx_cleanup_subscription(dev, sub);
-		kfree(entry);
+	if (dev->devx_whitelist_uid) {
+		mlx5_eq_notifier_unregister(dev->mdev, &table->devx_nb);
+		mutex_lock(&dev->devx_event_table.event_xa_lock);
+		xa_for_each(&table->event_xa, id, entry) {
+			event = entry;
+			list_for_each_entry_safe(
+				sub, tmp, &event->unaffiliated_list, xa_list)
+				devx_cleanup_subscription(dev, sub);
+			kfree(entry);
+		}
+		mutex_unlock(&dev->devx_event_table.event_xa_lock);
+		xa_destroy(&table->event_xa);
+
+		mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid);
 	}
-	mutex_unlock(&dev->devx_event_table.event_xa_lock);
-	xa_destroy(&table->event_xa);
 }
 
 static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf,
@@ -2475,11 +2429,11 @@
 			return -ERESTARTSYS;
 		}
 
-		if (list_empty(&ev_queue->event_list) &&
-		    ev_queue->is_destroyed)
-			return -EIO;
-
 		spin_lock_irq(&ev_queue->lock);
+		if (ev_queue->is_destroyed) {
+			spin_unlock_irq(&ev_queue->lock);
+			return -EIO;
+		}
 	}
 
 	event = list_entry(ev_queue->event_list.next,
@@ -2505,23 +2459,6 @@
 	return ret;
 }
 
-static int devx_async_cmd_event_close(struct inode *inode, struct file *filp)
-{
-	struct ib_uobject *uobj = filp->private_data;
-	struct devx_async_cmd_event_file *comp_ev_file = container_of(
-		uobj, struct devx_async_cmd_event_file, uobj);
-	struct devx_async_data *entry, *tmp;
-
-	spin_lock_irq(&comp_ev_file->ev_queue.lock);
-	list_for_each_entry_safe(entry, tmp,
-				 &comp_ev_file->ev_queue.event_list, list)
-		kvfree(entry);
-	spin_unlock_irq(&comp_ev_file->ev_queue.lock);
-
-	uverbs_close_fd(filp);
-	return 0;
-}
-
 static __poll_t devx_async_cmd_event_poll(struct file *filp,
 					      struct poll_table_struct *wait)
 {
@@ -2545,7 +2482,7 @@
 	.owner	 = THIS_MODULE,
 	.read	 = devx_async_cmd_event_read,
 	.poll    = devx_async_cmd_event_poll,
-	.release = devx_async_cmd_event_close,
+	.release = uverbs_uobject_fd_release,
 	.llseek	 = no_llseek,
 };
 
@@ -2554,7 +2491,7 @@
 {
 	struct devx_async_event_file *ev_file = filp->private_data;
 	struct devx_event_subscription *event_sub;
-	struct devx_async_event_data *uninitialized_var(event);
+	struct devx_async_event_data *event;
 	int ret = 0;
 	size_t eventsz;
 	bool omit_data;
@@ -2570,10 +2507,6 @@
 		return -EOVERFLOW;
 	}
 
-	if (ev_file->is_destroyed) {
-		spin_unlock_irq(&ev_file->lock);
-		return -EIO;
-	}
 
 	while (list_empty(&ev_file->event_list)) {
 		spin_unlock_irq(&ev_file->lock);
@@ -2649,81 +2582,96 @@
 	return pollflags;
 }
 
-static int devx_async_event_close(struct inode *inode, struct file *filp)
+static void devx_free_subscription(struct rcu_head *rcu)
 {
-	struct devx_async_event_file *ev_file = filp->private_data;
-	struct devx_event_subscription *event_sub, *event_sub_tmp;
-	struct devx_async_event_data *entry, *tmp;
-	struct mlx5_ib_dev *dev = ev_file->dev;
+	struct devx_event_subscription *event_sub =
+		container_of(rcu, struct devx_event_subscription, rcu);
 
-	mutex_lock(&dev->devx_event_table.event_xa_lock);
-	/* delete the subscriptions which are related to this FD */
-	list_for_each_entry_safe(event_sub, event_sub_tmp,
-				 &ev_file->subscribed_events_list, file_list) {
-		devx_cleanup_subscription(dev, event_sub);
-		if (event_sub->eventfd)
-			eventfd_ctx_put(event_sub->eventfd);
-
-		list_del_rcu(&event_sub->file_list);
-		/* subscription may not be used by the read API any more */
-		kfree_rcu(event_sub, rcu);
-	}
-
-	mutex_unlock(&dev->devx_event_table.event_xa_lock);
-
-	/* free the pending events allocation */
-	if (!ev_file->omit_data) {
-		spin_lock_irq(&ev_file->lock);
-		list_for_each_entry_safe(entry, tmp,
-					 &ev_file->event_list, list)
-			kfree(entry); /* read can't come any more */
-		spin_unlock_irq(&ev_file->lock);
-	}
-
-	uverbs_close_fd(filp);
-	put_device(&dev->ib_dev.dev);
-	return 0;
+	if (event_sub->eventfd)
+		eventfd_ctx_put(event_sub->eventfd);
+	uverbs_uobject_put(&event_sub->ev_file->uobj);
+	kfree(event_sub);
 }
 
 static const struct file_operations devx_async_event_fops = {
 	.owner	 = THIS_MODULE,
 	.read	 = devx_async_event_read,
 	.poll    = devx_async_event_poll,
-	.release = devx_async_event_close,
+	.release = uverbs_uobject_fd_release,
 	.llseek	 = no_llseek,
 };
 
-static int devx_hot_unplug_async_cmd_event_file(struct ib_uobject *uobj,
-						   enum rdma_remove_reason why)
+static int devx_async_cmd_event_destroy_uobj(struct ib_uobject *uobj,
+					     enum rdma_remove_reason why)
 {
 	struct devx_async_cmd_event_file *comp_ev_file =
 		container_of(uobj, struct devx_async_cmd_event_file,
 			     uobj);
 	struct devx_async_event_queue *ev_queue = &comp_ev_file->ev_queue;
+	struct devx_async_data *entry, *tmp;
 
 	spin_lock_irq(&ev_queue->lock);
 	ev_queue->is_destroyed = 1;
 	spin_unlock_irq(&ev_queue->lock);
-
-	if (why == RDMA_REMOVE_DRIVER_REMOVE)
-		wake_up_interruptible(&ev_queue->poll_wait);
+	wake_up_interruptible(&ev_queue->poll_wait);
 
 	mlx5_cmd_cleanup_async_ctx(&comp_ev_file->async_ctx);
+
+	spin_lock_irq(&comp_ev_file->ev_queue.lock);
+	list_for_each_entry_safe(entry, tmp,
+				 &comp_ev_file->ev_queue.event_list, list) {
+		list_del(&entry->list);
+		kvfree(entry);
+	}
+	spin_unlock_irq(&comp_ev_file->ev_queue.lock);
 	return 0;
 };
 
-static int devx_hot_unplug_async_event_file(struct ib_uobject *uobj,
-					    enum rdma_remove_reason why)
+static int devx_async_event_destroy_uobj(struct ib_uobject *uobj,
+					 enum rdma_remove_reason why)
 {
 	struct devx_async_event_file *ev_file =
 		container_of(uobj, struct devx_async_event_file,
 			     uobj);
+	struct devx_event_subscription *event_sub, *event_sub_tmp;
+	struct mlx5_ib_dev *dev = ev_file->dev;
 
 	spin_lock_irq(&ev_file->lock);
 	ev_file->is_destroyed = 1;
-	spin_unlock_irq(&ev_file->lock);
 
+	/* free the pending events allocation */
+	if (ev_file->omit_data) {
+		struct devx_event_subscription *event_sub, *tmp;
+
+		list_for_each_entry_safe(event_sub, tmp, &ev_file->event_list,
+					 event_list)
+			list_del_init(&event_sub->event_list);
+
+	} else {
+		struct devx_async_event_data *entry, *tmp;
+
+		list_for_each_entry_safe(entry, tmp, &ev_file->event_list,
+					 list) {
+			list_del(&entry->list);
+			kfree(entry);
+		}
+	}
+
+	spin_unlock_irq(&ev_file->lock);
 	wake_up_interruptible(&ev_file->poll_wait);
+
+	mutex_lock(&dev->devx_event_table.event_xa_lock);
+	/* delete the subscriptions which are related to this FD */
+	list_for_each_entry_safe(event_sub, event_sub_tmp,
+				 &ev_file->subscribed_events_list, file_list) {
+		devx_cleanup_subscription(dev, event_sub);
+		list_del_rcu(&event_sub->file_list);
+		/* subscription may not be used by the read API any more */
+		call_rcu(&event_sub->rcu, devx_free_subscription);
+	}
+	mutex_unlock(&dev->devx_event_table.event_xa_lock);
+
+	put_device(&dev->ib_dev.dev);
 	return 0;
 };
 
@@ -2909,7 +2857,7 @@
 DECLARE_UVERBS_NAMED_OBJECT(
 	MLX5_IB_OBJECT_DEVX_ASYNC_CMD_FD,
 	UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_cmd_event_file),
-			     devx_hot_unplug_async_cmd_event_file,
+			     devx_async_cmd_event_destroy_uobj,
 			     &devx_async_cmd_event_fops, "[devx_async_cmd]",
 			     O_RDONLY),
 	&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC));
@@ -2927,7 +2875,7 @@
 DECLARE_UVERBS_NAMED_OBJECT(
 	MLX5_IB_OBJECT_DEVX_ASYNC_EVENT_FD,
 	UVERBS_TYPE_ALLOC_FD(sizeof(struct devx_async_event_file),
-			     devx_hot_unplug_async_event_file,
+			     devx_async_event_destroy_uobj,
 			     &devx_async_event_fops, "[devx_async_event]",
 			     O_RDONLY),
 	&UVERBS_METHOD(MLX5_IB_METHOD_DEVX_ASYNC_EVENT_FD_ALLOC));
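The devx.c changes above drop the per-subscription struct file reference and the custom close() handlers in favour of the core uverbs_uobject_fd_release(), with each subscription instead pinning the event-file uobject via uverbs_uobject_get() and releasing it from an RCU callback once it is unlinked. A minimal sketch of that reference pattern, assuming the rdma/uverbs_types.h helpers; the my_sub names are illustrative only:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <rdma/uverbs_types.h>

struct my_sub {
	struct rcu_head rcu;
	struct ib_uobject *ev_uobj;	/* pinned with uverbs_uobject_get() */
};

static void my_sub_free(struct rcu_head *rcu)
{
	struct my_sub *sub = container_of(rcu, struct my_sub, rcu);

	/* drop the uobject reference only after RCU readers are done */
	uverbs_uobject_put(sub->ev_uobj);
	kfree(sub);
}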
diff --git a/drivers/infiniband/hw/mlx5/devx.h b/drivers/infiniband/hw/mlx5/devx.h
new file mode 100644
index 0000000..1f69866
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/devx.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2019-2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#ifndef _MLX5_IB_DEVX_H
+#define _MLX5_IB_DEVX_H
+
+#include "mlx5_ib.h"
+
+#define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in)
+struct devx_obj {
+	struct mlx5_ib_dev	*ib_dev;
+	u64			obj_id;
+	u32			dinlen; /* destroy inbox length */
+	u32			dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW];
+	u32			flags;
+	union {
+		struct mlx5_ib_devx_mr	devx_mr;
+		struct mlx5_core_dct	core_dct;
+		struct mlx5_core_cq	core_cq;
+		u32			flow_counter_bulk_size;
+	};
+	struct list_head event_sub; /* holds devx_event_subscription entries */
+};
+#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
+int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user);
+void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid);
+int mlx5_ib_devx_init(struct mlx5_ib_dev *dev);
+void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev);
+#else
+static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user)
+{
+	return -EOPNOTSUPP;
+}
+static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) {}
+static inline int mlx5_ib_devx_init(struct mlx5_ib_dev *dev)
+{
+	return 0;
+}
+static inline void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev)
+{
+}
+#endif
+#endif /* _MLX5_IB_DEVX_H */
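The new devx.h moves the devx_obj definition out of devx.c (adding flow_counter_bulk_size to the union) and provides no-op stubs when CONFIG_INFINIBAND_USER_ACCESS is disabled, so callers can invoke the init/cleanup helpers unconditionally. A hedged sketch under that assumption; my_profile_init and my_profile_cleanup are illustrative names, not functions in the patch:

#include "devx.h"

static int my_profile_init(struct mlx5_ib_dev *dev)
{
	int err;

	/* returns 0 even when no DEVX whitelist uid could be created */
	err = mlx5_ib_devx_init(dev);
	if (err)
		return err;
	/* ... remaining stage initialisation ... */
	return 0;
}

static void my_profile_cleanup(struct mlx5_ib_dev *dev)
{
	/* no-op unless a whitelist uid was created during init */
	mlx5_ib_devx_cleanup(dev);
}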
diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c
index 8f4e5f2..61475b5 100644
--- a/drivers/infiniband/hw/mlx5/doorbell.c
+++ b/drivers/infiniband/hw/mlx5/doorbell.c
@@ -64,7 +64,8 @@
 
 	page->user_virt = (virt & PAGE_MASK);
 	page->refcnt    = 0;
-	page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0, 0);
+	page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK,
+				 PAGE_SIZE, 0);
 	if (IS_ERR(page->umem)) {
 		err = PTR_ERR(page->umem);
 		kfree(page);
diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c
deleted file mode 100644
index fddefb2..0000000
--- a/drivers/infiniband/hw/mlx5/flow.c
+++ /dev/null
@@ -1,705 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
-/*
- * Copyright (c) 2018, Mellanox Technologies inc.  All rights reserved.
- */
-
-#include <rdma/ib_user_verbs.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/uverbs_types.h>
-#include <rdma/uverbs_ioctl.h>
-#include <rdma/uverbs_std_types.h>
-#include <rdma/mlx5_user_ioctl_cmds.h>
-#include <rdma/mlx5_user_ioctl_verbs.h>
-#include <rdma/ib_umem.h>
-#include <linux/mlx5/driver.h>
-#include <linux/mlx5/fs.h>
-#include <linux/mlx5/eswitch.h>
-#include "mlx5_ib.h"
-
-#define UVERBS_MODULE_NAME mlx5_ib
-#include <rdma/uverbs_named_ioctl.h>
-
-static int
-mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type,
-			     enum mlx5_flow_namespace_type *namespace)
-{
-	switch (table_type) {
-	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX:
-		*namespace = MLX5_FLOW_NAMESPACE_BYPASS;
-		break;
-	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX:
-		*namespace = MLX5_FLOW_NAMESPACE_EGRESS;
-		break;
-	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB:
-		*namespace = MLX5_FLOW_NAMESPACE_FDB;
-		break;
-	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_RX:
-		*namespace = MLX5_FLOW_NAMESPACE_RDMA_RX;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static const struct uverbs_attr_spec mlx5_ib_flow_type[] = {
-	[MLX5_IB_FLOW_TYPE_NORMAL] = {
-		.type = UVERBS_ATTR_TYPE_PTR_IN,
-		.u.ptr = {
-			.len = sizeof(u16), /* data is priority */
-			.min_len = sizeof(u16),
-		}
-	},
-	[MLX5_IB_FLOW_TYPE_SNIFFER] = {
-		.type = UVERBS_ATTR_TYPE_PTR_IN,
-		UVERBS_ATTR_NO_DATA(),
-	},
-	[MLX5_IB_FLOW_TYPE_ALL_DEFAULT] = {
-		.type = UVERBS_ATTR_TYPE_PTR_IN,
-		UVERBS_ATTR_NO_DATA(),
-	},
-	[MLX5_IB_FLOW_TYPE_MC_DEFAULT] = {
-		.type = UVERBS_ATTR_TYPE_PTR_IN,
-		UVERBS_ATTR_NO_DATA(),
-	},
-};
-
-#define MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS 2
-static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
-	struct uverbs_attr_bundle *attrs)
-{
-	struct mlx5_flow_context flow_context = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG};
-	struct mlx5_ib_flow_handler *flow_handler;
-	struct mlx5_ib_flow_matcher *fs_matcher;
-	struct ib_uobject **arr_flow_actions;
-	struct ib_uflow_resources *uflow_res;
-	struct mlx5_flow_act flow_act = {};
-	void *devx_obj;
-	int dest_id, dest_type;
-	void *cmd_in;
-	int inlen;
-	bool dest_devx, dest_qp;
-	struct ib_qp *qp = NULL;
-	struct ib_uobject *uobj =
-		uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE);
-	struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
-	int len, ret, i;
-	u32 counter_id = 0;
-
-	if (!capable(CAP_NET_RAW))
-		return -EPERM;
-
-	dest_devx =
-		uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX);
-	dest_qp = uverbs_attr_is_valid(attrs,
-				       MLX5_IB_ATTR_CREATE_FLOW_DEST_QP);
-
-	fs_matcher = uverbs_attr_get_obj(attrs,
-					 MLX5_IB_ATTR_CREATE_FLOW_MATCHER);
-	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS &&
-	    ((dest_devx && dest_qp) || (!dest_devx && !dest_qp)))
-		return -EINVAL;
-
-	/* Allow only DEVX object as dest when inserting to FDB */
-	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB && !dest_devx)
-		return -EINVAL;
-
-	/* Allow only DEVX object or QP as dest when inserting to RDMA_RX */
-	if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) &&
-	    ((!dest_devx && !dest_qp) || (dest_devx && dest_qp)))
-		return -EINVAL;
-
-	if (dest_devx) {
-		devx_obj = uverbs_attr_get_obj(
-			attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX);
-		if (IS_ERR(devx_obj))
-			return PTR_ERR(devx_obj);
-
-		/* Verify that the given DEVX object is a flow
-		 * steering destination.
-		 */
-		if (!mlx5_ib_devx_is_flow_dest(devx_obj, &dest_id, &dest_type))
-			return -EINVAL;
-		/* Allow only flow table as dest when inserting to FDB or RDMA_RX */
-		if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB ||
-		     fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) &&
-		    dest_type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
-			return -EINVAL;
-	} else if (dest_qp) {
-		struct mlx5_ib_qp *mqp;
-
-		qp = uverbs_attr_get_obj(attrs,
-					 MLX5_IB_ATTR_CREATE_FLOW_DEST_QP);
-		if (IS_ERR(qp))
-			return PTR_ERR(qp);
-
-		if (qp->qp_type != IB_QPT_RAW_PACKET)
-			return -EINVAL;
-
-		mqp = to_mqp(qp);
-		if (mqp->flags & MLX5_IB_QP_RSS)
-			dest_id = mqp->rss_qp.tirn;
-		else
-			dest_id = mqp->raw_packet_qp.rq.tirn;
-		dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR;
-	} else {
-		dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT;
-	}
-
-	len = uverbs_attr_get_uobjs_arr(attrs,
-		MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions);
-	if (len) {
-		devx_obj = arr_flow_actions[0]->object;
-
-		if (!mlx5_ib_devx_is_flow_counter(devx_obj, &counter_id))
-			return -EINVAL;
-		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
-	}
-
-	if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR &&
-	    fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS)
-		return -EINVAL;
-
-	cmd_in = uverbs_attr_get_alloced_ptr(
-		attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE);
-	inlen = uverbs_attr_get_len(attrs,
-				    MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE);
-
-	uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS);
-	if (!uflow_res)
-		return -ENOMEM;
-
-	len = uverbs_attr_get_uobjs_arr(attrs,
-		MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions);
-	for (i = 0; i < len; i++) {
-		struct mlx5_ib_flow_action *maction =
-			to_mflow_act(arr_flow_actions[i]->object);
-
-		ret = parse_flow_flow_action(maction, false, &flow_act);
-		if (ret)
-			goto err_out;
-		flow_resources_add(uflow_res, IB_FLOW_SPEC_ACTION_HANDLE,
-				   arr_flow_actions[i]->object);
-	}
-
-	ret = uverbs_copy_from(&flow_context.flow_tag, attrs,
-			       MLX5_IB_ATTR_CREATE_FLOW_TAG);
-	if (!ret) {
-		if (flow_context.flow_tag >= BIT(24)) {
-			ret = -EINVAL;
-			goto err_out;
-		}
-		flow_context.flags |= FLOW_CONTEXT_HAS_TAG;
-	}
-
-	flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher,
-					       &flow_context,
-					       &flow_act,
-					       counter_id,
-					       cmd_in, inlen,
-					       dest_id, dest_type);
-	if (IS_ERR(flow_handler)) {
-		ret = PTR_ERR(flow_handler);
-		goto err_out;
-	}
-
-	ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, uflow_res);
-
-	return 0;
-err_out:
-	ib_uverbs_flow_resources_free(uflow_res);
-	return ret;
-}
-
-static int flow_matcher_cleanup(struct ib_uobject *uobject,
-				enum rdma_remove_reason why,
-				struct uverbs_attr_bundle *attrs)
-{
-	struct mlx5_ib_flow_matcher *obj = uobject->object;
-	int ret;
-
-	ret = ib_destroy_usecnt(&obj->usecnt, why, uobject);
-	if (ret)
-		return ret;
-
-	kfree(obj);
-	return 0;
-}
-
-static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs,
-			      struct mlx5_ib_flow_matcher *obj)
-{
-	enum mlx5_ib_uapi_flow_table_type ft_type =
-		MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX;
-	u32 flags;
-	int err;
-
-	/* New users should use MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE and older
-	 * users should switch to it. We leave this to not break userspace
-	 */
-	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE) &&
-	    uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS))
-		return -EINVAL;
-
-	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE)) {
-		err = uverbs_get_const(&ft_type, attrs,
-				       MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE);
-		if (err)
-			return err;
-
-		err = mlx5_ib_ft_type_to_namespace(ft_type, &obj->ns_type);
-		if (err)
-			return err;
-
-		return 0;
-	}
-
-	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS)) {
-		err = uverbs_get_flags32(&flags, attrs,
-					 MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
-					 IB_FLOW_ATTR_FLAGS_EGRESS);
-		if (err)
-			return err;
-
-		if (flags) {
-			mlx5_ib_ft_type_to_namespace(
-				MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX,
-				&obj->ns_type);
-			return 0;
-		}
-	}
-
-	obj->ns_type = MLX5_FLOW_NAMESPACE_BYPASS;
-
-	return 0;
-}
-
-static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)(
-	struct uverbs_attr_bundle *attrs)
-{
-	struct ib_uobject *uobj = uverbs_attr_get_uobject(
-		attrs, MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE);
-	struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
-	struct mlx5_ib_flow_matcher *obj;
-	int err;
-
-	obj = kzalloc(sizeof(struct mlx5_ib_flow_matcher), GFP_KERNEL);
-	if (!obj)
-		return -ENOMEM;
-
-	obj->mask_len = uverbs_attr_get_len(
-		attrs, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK);
-	err = uverbs_copy_from(&obj->matcher_mask,
-			       attrs,
-			       MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK);
-	if (err)
-		goto end;
-
-	obj->flow_type = uverbs_attr_get_enum_id(
-		attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE);
-
-	if (obj->flow_type == MLX5_IB_FLOW_TYPE_NORMAL) {
-		err = uverbs_copy_from(&obj->priority,
-				       attrs,
-				       MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE);
-		if (err)
-			goto end;
-	}
-
-	err = uverbs_copy_from(&obj->match_criteria_enable,
-			       attrs,
-			       MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA);
-	if (err)
-		goto end;
-
-	err = mlx5_ib_matcher_ns(attrs, obj);
-	if (err)
-		goto end;
-
-	if (obj->ns_type == MLX5_FLOW_NAMESPACE_FDB &&
-	    mlx5_eswitch_mode(dev->mdev->priv.eswitch) !=
-			      MLX5_ESWITCH_OFFLOADS) {
-		err = -EINVAL;
-		goto end;
-	}
-
-	uobj->object = obj;
-	obj->mdev = dev->mdev;
-	atomic_set(&obj->usecnt, 0);
-	return 0;
-
-end:
-	kfree(obj);
-	return err;
-}
-
-void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction)
-{
-	switch (maction->flow_action_raw.sub_type) {
-	case MLX5_IB_FLOW_ACTION_MODIFY_HEADER:
-		mlx5_modify_header_dealloc(maction->flow_action_raw.dev->mdev,
-					   maction->flow_action_raw.modify_hdr);
-		break;
-	case MLX5_IB_FLOW_ACTION_PACKET_REFORMAT:
-		mlx5_packet_reformat_dealloc(maction->flow_action_raw.dev->mdev,
-					     maction->flow_action_raw.pkt_reformat);
-		break;
-	case MLX5_IB_FLOW_ACTION_DECAP:
-		break;
-	default:
-		break;
-	}
-}
-
-static struct ib_flow_action *
-mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev,
-			     enum mlx5_ib_uapi_flow_table_type ft_type,
-			     u8 num_actions, void *in)
-{
-	enum mlx5_flow_namespace_type namespace;
-	struct mlx5_ib_flow_action *maction;
-	int ret;
-
-	ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace);
-	if (ret)
-		return ERR_PTR(-EINVAL);
-
-	maction = kzalloc(sizeof(*maction), GFP_KERNEL);
-	if (!maction)
-		return ERR_PTR(-ENOMEM);
-
-	maction->flow_action_raw.modify_hdr =
-		mlx5_modify_header_alloc(dev->mdev, namespace, num_actions, in);
-
-	if (IS_ERR(maction->flow_action_raw.modify_hdr)) {
-		ret = PTR_ERR(maction->flow_action_raw.modify_hdr);
-		kfree(maction);
-		return ERR_PTR(ret);
-	}
-	maction->flow_action_raw.sub_type =
-		MLX5_IB_FLOW_ACTION_MODIFY_HEADER;
-	maction->flow_action_raw.dev = dev;
-
-	return &maction->ib_action;
-}
-
-static bool mlx5_ib_modify_header_supported(struct mlx5_ib_dev *dev)
-{
-	return MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
-					 max_modify_header_actions) ||
-	       MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, max_modify_header_actions);
-}
-
-static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)(
-	struct uverbs_attr_bundle *attrs)
-{
-	struct ib_uobject *uobj = uverbs_attr_get_uobject(
-		attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE);
-	struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata);
-	enum mlx5_ib_uapi_flow_table_type ft_type;
-	struct ib_flow_action *action;
-	int num_actions;
-	void *in;
-	int ret;
-
-	if (!mlx5_ib_modify_header_supported(mdev))
-		return -EOPNOTSUPP;
-
-	in = uverbs_attr_get_alloced_ptr(attrs,
-		MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM);
-
-	num_actions = uverbs_attr_ptr_get_array_size(
-		attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM,
-		MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto));
-	if (num_actions < 0)
-		return num_actions;
-
-	ret = uverbs_get_const(&ft_type, attrs,
-			       MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE);
-	if (ret)
-		return ret;
-	action = mlx5_ib_create_modify_header(mdev, ft_type, num_actions, in);
-	if (IS_ERR(action))
-		return PTR_ERR(action);
-
-	uverbs_flow_action_fill_action(action, uobj, &mdev->ib_dev,
-				       IB_FLOW_ACTION_UNSPECIFIED);
-
-	return 0;
-}
-
-static bool mlx5_ib_flow_action_packet_reformat_valid(struct mlx5_ib_dev *ibdev,
-						      u8 packet_reformat_type,
-						      u8 ft_type)
-{
-	switch (packet_reformat_type) {
-	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL:
-		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX)
-			return MLX5_CAP_FLOWTABLE(ibdev->mdev,
-						  encap_general_header);
-		break;
-	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL:
-		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX)
-			return MLX5_CAP_FLOWTABLE_NIC_TX(ibdev->mdev,
-				reformat_l2_to_l3_tunnel);
-		break;
-	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2:
-		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX)
-			return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev,
-				reformat_l3_tunnel_to_l2);
-		break;
-	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2:
-		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX)
-			return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, decap);
-		break;
-	default:
-		break;
-	}
-
-	return false;
-}
-
-static int mlx5_ib_dv_to_prm_packet_reforamt_type(u8 dv_prt, u8 *prm_prt)
-{
-	switch (dv_prt) {
-	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL:
-		*prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL;
-		break;
-	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2:
-		*prm_prt = MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2;
-		break;
-	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL:
-		*prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int mlx5_ib_flow_action_create_packet_reformat_ctx(
-	struct mlx5_ib_dev *dev,
-	struct mlx5_ib_flow_action *maction,
-	u8 ft_type, u8 dv_prt,
-	void *in, size_t len)
-{
-	enum mlx5_flow_namespace_type namespace;
-	u8 prm_prt;
-	int ret;
-
-	ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace);
-	if (ret)
-		return ret;
-
-	ret = mlx5_ib_dv_to_prm_packet_reforamt_type(dv_prt, &prm_prt);
-	if (ret)
-		return ret;
-
-	maction->flow_action_raw.pkt_reformat =
-		mlx5_packet_reformat_alloc(dev->mdev, prm_prt, len,
-					   in, namespace);
-	if (IS_ERR(maction->flow_action_raw.pkt_reformat)) {
-		ret = PTR_ERR(maction->flow_action_raw.pkt_reformat);
-		return ret;
-	}
-
-	maction->flow_action_raw.sub_type =
-		MLX5_IB_FLOW_ACTION_PACKET_REFORMAT;
-	maction->flow_action_raw.dev = dev;
-
-	return 0;
-}
-
-static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)(
-	struct uverbs_attr_bundle *attrs)
-{
-	struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
-		MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE);
-	struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata);
-	enum mlx5_ib_uapi_flow_action_packet_reformat_type dv_prt;
-	enum mlx5_ib_uapi_flow_table_type ft_type;
-	struct mlx5_ib_flow_action *maction;
-	int ret;
-
-	ret = uverbs_get_const(&ft_type, attrs,
-			       MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE);
-	if (ret)
-		return ret;
-
-	ret = uverbs_get_const(&dv_prt, attrs,
-			       MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE);
-	if (ret)
-		return ret;
-
-	if (!mlx5_ib_flow_action_packet_reformat_valid(mdev, dv_prt, ft_type))
-		return -EOPNOTSUPP;
-
-	maction = kzalloc(sizeof(*maction), GFP_KERNEL);
-	if (!maction)
-		return -ENOMEM;
-
-	if (dv_prt ==
-	    MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2) {
-		maction->flow_action_raw.sub_type =
-			MLX5_IB_FLOW_ACTION_DECAP;
-		maction->flow_action_raw.dev = mdev;
-	} else {
-		void *in;
-		int len;
-
-		in = uverbs_attr_get_alloced_ptr(attrs,
-			MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF);
-		if (IS_ERR(in)) {
-			ret = PTR_ERR(in);
-			goto free_maction;
-		}
-
-		len = uverbs_attr_get_len(attrs,
-			MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF);
-
-		ret = mlx5_ib_flow_action_create_packet_reformat_ctx(mdev,
-			maction, ft_type, dv_prt, in, len);
-		if (ret)
-			goto free_maction;
-	}
-
-	uverbs_flow_action_fill_action(&maction->ib_action, uobj, &mdev->ib_dev,
-				       IB_FLOW_ACTION_UNSPECIFIED);
-	return 0;
-
-free_maction:
-	kfree(maction);
-	return ret;
-}
-
-DECLARE_UVERBS_NAMED_METHOD(
-	MLX5_IB_METHOD_CREATE_FLOW,
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE,
-			UVERBS_OBJECT_FLOW,
-			UVERBS_ACCESS_NEW,
-			UA_MANDATORY),
-	UVERBS_ATTR_PTR_IN(
-		MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE,
-		UVERBS_ATTR_SIZE(1, sizeof(struct mlx5_ib_match_params)),
-		UA_MANDATORY,
-		UA_ALLOC_AND_COPY),
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_MATCHER,
-			MLX5_IB_OBJECT_FLOW_MATCHER,
-			UVERBS_ACCESS_READ,
-			UA_MANDATORY),
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_QP,
-			UVERBS_OBJECT_QP,
-			UVERBS_ACCESS_READ),
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX,
-			MLX5_IB_OBJECT_DEVX_OBJ,
-			UVERBS_ACCESS_READ),
-	UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS,
-			     UVERBS_OBJECT_FLOW_ACTION,
-			     UVERBS_ACCESS_READ, 1,
-			     MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS,
-			     UA_OPTIONAL),
-	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_TAG,
-			   UVERBS_ATTR_TYPE(u32),
-			   UA_OPTIONAL),
-	UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX,
-			     MLX5_IB_OBJECT_DEVX_OBJ,
-			     UVERBS_ACCESS_READ, 1, 1,
-			     UA_OPTIONAL));
-
-DECLARE_UVERBS_NAMED_METHOD_DESTROY(
-	MLX5_IB_METHOD_DESTROY_FLOW,
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE,
-			UVERBS_OBJECT_FLOW,
-			UVERBS_ACCESS_DESTROY,
-			UA_MANDATORY));
-
-ADD_UVERBS_METHODS(mlx5_ib_fs,
-		   UVERBS_OBJECT_FLOW,
-		   &UVERBS_METHOD(MLX5_IB_METHOD_CREATE_FLOW),
-		   &UVERBS_METHOD(MLX5_IB_METHOD_DESTROY_FLOW));
-
-DECLARE_UVERBS_NAMED_METHOD(
-	MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER,
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE,
-			UVERBS_OBJECT_FLOW_ACTION,
-			UVERBS_ACCESS_NEW,
-			UA_MANDATORY),
-	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM,
-			   UVERBS_ATTR_MIN_SIZE(MLX5_UN_SZ_BYTES(
-				   set_action_in_add_action_in_auto)),
-			   UA_MANDATORY,
-			   UA_ALLOC_AND_COPY),
-	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE,
-			     enum mlx5_ib_uapi_flow_table_type,
-			     UA_MANDATORY));
-
-DECLARE_UVERBS_NAMED_METHOD(
-	MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT,
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE,
-			UVERBS_OBJECT_FLOW_ACTION,
-			UVERBS_ACCESS_NEW,
-			UA_MANDATORY),
-	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF,
-			   UVERBS_ATTR_MIN_SIZE(1),
-			   UA_ALLOC_AND_COPY,
-			   UA_OPTIONAL),
-	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE,
-			     enum mlx5_ib_uapi_flow_action_packet_reformat_type,
-			     UA_MANDATORY),
-	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE,
-			     enum mlx5_ib_uapi_flow_table_type,
-			     UA_MANDATORY));
-
-ADD_UVERBS_METHODS(
-	mlx5_ib_flow_actions,
-	UVERBS_OBJECT_FLOW_ACTION,
-	&UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER),
-	&UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT));
-
-DECLARE_UVERBS_NAMED_METHOD(
-	MLX5_IB_METHOD_FLOW_MATCHER_CREATE,
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE,
-			MLX5_IB_OBJECT_FLOW_MATCHER,
-			UVERBS_ACCESS_NEW,
-			UA_MANDATORY),
-	UVERBS_ATTR_PTR_IN(
-		MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK,
-		UVERBS_ATTR_SIZE(1, sizeof(struct mlx5_ib_match_params)),
-		UA_MANDATORY),
-	UVERBS_ATTR_ENUM_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE,
-			    mlx5_ib_flow_type,
-			    UA_MANDATORY),
-	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA,
-			   UVERBS_ATTR_TYPE(u8),
-			   UA_MANDATORY),
-	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
-			     enum ib_flow_flags,
-			     UA_OPTIONAL),
-	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE,
-			     enum mlx5_ib_uapi_flow_table_type,
-			     UA_OPTIONAL));
-
-DECLARE_UVERBS_NAMED_METHOD_DESTROY(
-	MLX5_IB_METHOD_FLOW_MATCHER_DESTROY,
-	UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_DESTROY_HANDLE,
-			MLX5_IB_OBJECT_FLOW_MATCHER,
-			UVERBS_ACCESS_DESTROY,
-			UA_MANDATORY));
-
-DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER,
-			    UVERBS_TYPE_ALLOC_IDR(flow_matcher_cleanup),
-			    &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE),
-			    &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY));
-
-const struct uapi_definition mlx5_ib_flow_defs[] = {
-	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
-		MLX5_IB_OBJECT_FLOW_MATCHER),
-	UAPI_DEF_CHAIN_OBJ_TREE(
-		UVERBS_OBJECT_FLOW,
-		&mlx5_ib_fs),
-	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
-				&mlx5_ib_flow_actions),
-	{},
-};
diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c
new file mode 100644
index 0000000..b3391ec
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/fs.c
@@ -0,0 +1,2544 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2018, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/uverbs_types.h>
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/uverbs_std_types.h>
+#include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/mlx5_user_ioctl_verbs.h>
+#include <rdma/ib_umem.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/fs.h>
+#include <linux/mlx5/fs_helpers.h>
+#include <linux/mlx5/accel.h>
+#include <linux/mlx5/eswitch.h>
+#include "mlx5_ib.h"
+#include "counters.h"
+#include "devx.h"
+#include "fs.h"
+
+#define UVERBS_MODULE_NAME mlx5_ib
+#include <rdma/uverbs_named_ioctl.h>
+
+enum {
+	MATCH_CRITERIA_ENABLE_OUTER_BIT,
+	MATCH_CRITERIA_ENABLE_MISC_BIT,
+	MATCH_CRITERIA_ENABLE_INNER_BIT,
+	MATCH_CRITERIA_ENABLE_MISC2_BIT
+};
+
+#define HEADER_IS_ZERO(match_criteria, headers)			           \
+	!(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
+		    0, MLX5_FLD_SZ_BYTES(fte_match_param, headers)))       \
+
+static u8 get_match_criteria_enable(u32 *match_criteria)
+{
+	u8 match_criteria_enable;
+
+	match_criteria_enable =
+		(!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
+		MATCH_CRITERIA_ENABLE_OUTER_BIT;
+	match_criteria_enable |=
+		(!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
+		MATCH_CRITERIA_ENABLE_MISC_BIT;
+	match_criteria_enable |=
+		(!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
+		MATCH_CRITERIA_ENABLE_INNER_BIT;
+	match_criteria_enable |=
+		(!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) <<
+		MATCH_CRITERIA_ENABLE_MISC2_BIT;
+
+	return match_criteria_enable;
+}
+
+static int set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
+{
+	u8 entry_mask;
+	u8 entry_val;
+	int err = 0;
+
+	if (!mask)
+		goto out;
+
+	entry_mask = MLX5_GET(fte_match_set_lyr_2_4, outer_c,
+			      ip_protocol);
+	entry_val = MLX5_GET(fte_match_set_lyr_2_4, outer_v,
+			     ip_protocol);
+	if (!entry_mask) {
+		MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
+		MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
+		goto out;
+	}
+	/* Don't override existing ip protocol */
+	if (mask != entry_mask || val != entry_val)
+		err = -EINVAL;
+out:
+	return err;
+}
+
+static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val,
+			   bool inner)
+{
+	if (inner) {
+		MLX5_SET(fte_match_set_misc,
+			 misc_c, inner_ipv6_flow_label, mask);
+		MLX5_SET(fte_match_set_misc,
+			 misc_v, inner_ipv6_flow_label, val);
+	} else {
+		MLX5_SET(fte_match_set_misc,
+			 misc_c, outer_ipv6_flow_label, mask);
+		MLX5_SET(fte_match_set_misc,
+			 misc_v, outer_ipv6_flow_label, val);
+	}
+}
+
+static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
+{
+	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask);
+	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val);
+	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2);
+	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2);
+}
+
+static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask)
+{
+	if (MLX5_GET(fte_match_mpls, set_mask, mpls_label) &&
+	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_LABEL))
+		return -EOPNOTSUPP;
+
+	if (MLX5_GET(fte_match_mpls, set_mask, mpls_exp) &&
+	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_EXP))
+		return -EOPNOTSUPP;
+
+	if (MLX5_GET(fte_match_mpls, set_mask, mpls_s_bos) &&
+	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_S_BOS))
+		return -EOPNOTSUPP;
+
+	if (MLX5_GET(fte_match_mpls, set_mask, mpls_ttl) &&
+	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_TTL))
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+#define LAST_ETH_FIELD vlan_tag
+#define LAST_IB_FIELD sl
+#define LAST_IPV4_FIELD tos
+#define LAST_IPV6_FIELD traffic_class
+#define LAST_TCP_UDP_FIELD src_port
+#define LAST_TUNNEL_FIELD tunnel_id
+#define LAST_FLOW_TAG_FIELD tag_id
+#define LAST_DROP_FIELD size
+#define LAST_COUNTERS_FIELD counters
+
+/* Field is the last supported field */
+#define FIELDS_NOT_SUPPORTED(filter, field)                                    \
+	memchr_inv((void *)&filter.field + sizeof(filter.field), 0,            \
+		   sizeof(filter) - offsetofend(typeof(filter), field))
+
+int parse_flow_flow_action(struct mlx5_ib_flow_action *maction,
+			   bool is_egress,
+			   struct mlx5_flow_act *action)
+{
+
+	switch (maction->ib_action.type) {
+	case IB_FLOW_ACTION_ESP:
+		if (action->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
+				      MLX5_FLOW_CONTEXT_ACTION_DECRYPT))
+			return -EINVAL;
+		/* Currently only AES_GCM keymat is supported by the driver */
+		action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx;
+		action->action |= is_egress ?
+			MLX5_FLOW_CONTEXT_ACTION_ENCRYPT :
+			MLX5_FLOW_CONTEXT_ACTION_DECRYPT;
+		return 0;
+	case IB_FLOW_ACTION_UNSPECIFIED:
+		if (maction->flow_action_raw.sub_type ==
+		    MLX5_IB_FLOW_ACTION_MODIFY_HEADER) {
+			if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
+				return -EINVAL;
+			action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
+			action->modify_hdr =
+				maction->flow_action_raw.modify_hdr;
+			return 0;
+		}
+		if (maction->flow_action_raw.sub_type ==
+		    MLX5_IB_FLOW_ACTION_DECAP) {
+			if (action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP)
+				return -EINVAL;
+			action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
+			return 0;
+		}
+		if (maction->flow_action_raw.sub_type ==
+		    MLX5_IB_FLOW_ACTION_PACKET_REFORMAT) {
+			if (action->action &
+			    MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
+				return -EINVAL;
+			action->action |=
+				MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
+			action->pkt_reformat =
+				maction->flow_action_raw.pkt_reformat;
+			return 0;
+		}
+		fallthrough;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static int parse_flow_attr(struct mlx5_core_dev *mdev,
+			   struct mlx5_flow_spec *spec,
+			   const union ib_flow_spec *ib_spec,
+			   const struct ib_flow_attr *flow_attr,
+			   struct mlx5_flow_act *action, u32 prev_type)
+{
+	struct mlx5_flow_context *flow_context = &spec->flow_context;
+	u32 *match_c = spec->match_criteria;
+	u32 *match_v = spec->match_value;
+	void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
+					   misc_parameters);
+	void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
+					   misc_parameters);
+	void *misc_params2_c = MLX5_ADDR_OF(fte_match_param, match_c,
+					    misc_parameters_2);
+	void *misc_params2_v = MLX5_ADDR_OF(fte_match_param, match_v,
+					    misc_parameters_2);
+	void *headers_c;
+	void *headers_v;
+	int match_ipv;
+	int ret;
+
+	if (ib_spec->type & IB_FLOW_SPEC_INNER) {
+		headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
+					 inner_headers);
+		headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
+					 inner_headers);
+		match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+					ft_field_support.inner_ip_version);
+	} else {
+		headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
+					 outer_headers);
+		headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
+					 outer_headers);
+		match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+					ft_field_support.outer_ip_version);
+	}
+
+	switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
+	case IB_FLOW_SPEC_ETH:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
+			return -EOPNOTSUPP;
+
+		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+					     dmac_47_16),
+				ib_spec->eth.mask.dst_mac);
+		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+					     dmac_47_16),
+				ib_spec->eth.val.dst_mac);
+
+		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+					     smac_47_16),
+				ib_spec->eth.mask.src_mac);
+		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+					     smac_47_16),
+				ib_spec->eth.val.src_mac);
+
+		if (ib_spec->eth.mask.vlan_tag) {
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 cvlan_tag, 1);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 cvlan_tag, 1);
+
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 first_vid, ntohs(ib_spec->eth.val.vlan_tag));
+
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 first_cfi,
+				 ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 first_cfi,
+				 ntohs(ib_spec->eth.val.vlan_tag) >> 12);
+
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 first_prio,
+				 ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 first_prio,
+				 ntohs(ib_spec->eth.val.vlan_tag) >> 13);
+		}
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+			 ethertype, ntohs(ib_spec->eth.mask.ether_type));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+			 ethertype, ntohs(ib_spec->eth.val.ether_type));
+		break;
+	case IB_FLOW_SPEC_IPV4:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
+			return -EOPNOTSUPP;
+
+		if (match_ipv) {
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 ip_version, 0xf);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 ip_version, MLX5_FS_IPV4_VERSION);
+		} else {
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 ethertype, 0xffff);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 ethertype, ETH_P_IP);
+		}
+
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
+		       &ib_spec->ipv4.mask.src_ip,
+		       sizeof(ib_spec->ipv4.mask.src_ip));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
+		       &ib_spec->ipv4.val.src_ip,
+		       sizeof(ib_spec->ipv4.val.src_ip));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
+		       &ib_spec->ipv4.mask.dst_ip,
+		       sizeof(ib_spec->ipv4.mask.dst_ip));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
+		       &ib_spec->ipv4.val.dst_ip,
+		       sizeof(ib_spec->ipv4.val.dst_ip));
+
+		set_tos(headers_c, headers_v,
+			ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
+
+		if (set_proto(headers_c, headers_v,
+			      ib_spec->ipv4.mask.proto,
+			      ib_spec->ipv4.val.proto))
+			return -EINVAL;
+		break;
+	case IB_FLOW_SPEC_IPV6:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
+			return -EOPNOTSUPP;
+
+		if (match_ipv) {
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 ip_version, 0xf);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 ip_version, MLX5_FS_IPV6_VERSION);
+		} else {
+			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
+				 ethertype, 0xffff);
+			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
+				 ethertype, ETH_P_IPV6);
+		}
+
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+		       &ib_spec->ipv6.mask.src_ip,
+		       sizeof(ib_spec->ipv6.mask.src_ip));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
+		       &ib_spec->ipv6.val.src_ip,
+		       sizeof(ib_spec->ipv6.val.src_ip));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
+				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+		       &ib_spec->ipv6.mask.dst_ip,
+		       sizeof(ib_spec->ipv6.mask.dst_ip));
+		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
+				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
+		       &ib_spec->ipv6.val.dst_ip,
+		       sizeof(ib_spec->ipv6.val.dst_ip));
+
+		set_tos(headers_c, headers_v,
+			ib_spec->ipv6.mask.traffic_class,
+			ib_spec->ipv6.val.traffic_class);
+
+		if (set_proto(headers_c, headers_v,
+			      ib_spec->ipv6.mask.next_hdr,
+			      ib_spec->ipv6.val.next_hdr))
+			return -EINVAL;
+
+		set_flow_label(misc_params_c, misc_params_v,
+			       ntohl(ib_spec->ipv6.mask.flow_label),
+			       ntohl(ib_spec->ipv6.val.flow_label),
+			       ib_spec->type & IB_FLOW_SPEC_INNER);
+		break;
+	case IB_FLOW_SPEC_ESP:
+		if (ib_spec->esp.mask.seq)
+			return -EOPNOTSUPP;
+
+		MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi,
+			 ntohl(ib_spec->esp.mask.spi));
+		MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi,
+			 ntohl(ib_spec->esp.val.spi));
+		break;
+	case IB_FLOW_SPEC_TCP:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
+					 LAST_TCP_UDP_FIELD))
+			return -EOPNOTSUPP;
+
+		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_TCP))
+			return -EINVAL;
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
+			 ntohs(ib_spec->tcp_udp.mask.src_port));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport,
+			 ntohs(ib_spec->tcp_udp.val.src_port));
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport,
+			 ntohs(ib_spec->tcp_udp.mask.dst_port));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport,
+			 ntohs(ib_spec->tcp_udp.val.dst_port));
+		break;
+	case IB_FLOW_SPEC_UDP:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
+					 LAST_TCP_UDP_FIELD))
+			return -EOPNOTSUPP;
+
+		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_UDP))
+			return -EINVAL;
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
+			 ntohs(ib_spec->tcp_udp.mask.src_port));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport,
+			 ntohs(ib_spec->tcp_udp.val.src_port));
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport,
+			 ntohs(ib_spec->tcp_udp.mask.dst_port));
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport,
+			 ntohs(ib_spec->tcp_udp.val.dst_port));
+		break;
+	case IB_FLOW_SPEC_GRE:
+		if (ib_spec->gre.mask.c_ks_res0_ver)
+			return -EOPNOTSUPP;
+
+		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_GRE))
+			return -EINVAL;
+
+		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
+			 0xff);
+		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
+			 IPPROTO_GRE);
+
+		MLX5_SET(fte_match_set_misc, misc_params_c, gre_protocol,
+			 ntohs(ib_spec->gre.mask.protocol));
+		MLX5_SET(fte_match_set_misc, misc_params_v, gre_protocol,
+			 ntohs(ib_spec->gre.val.protocol));
+
+		memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c,
+				    gre_key.nvgre.hi),
+		       &ib_spec->gre.mask.key,
+		       sizeof(ib_spec->gre.mask.key));
+		memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v,
+				    gre_key.nvgre.hi),
+		       &ib_spec->gre.val.key,
+		       sizeof(ib_spec->gre.val.key));
+		break;
+	case IB_FLOW_SPEC_MPLS:
+		switch (prev_type) {
+		case IB_FLOW_SPEC_UDP:
+			if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+						   ft_field_support.outer_first_mpls_over_udp),
+						   &ib_spec->mpls.mask.tag))
+				return -EOPNOTSUPP;
+
+			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
+					    outer_first_mpls_over_udp),
+			       &ib_spec->mpls.val.tag,
+			       sizeof(ib_spec->mpls.val.tag));
+			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
+					    outer_first_mpls_over_udp),
+			       &ib_spec->mpls.mask.tag,
+			       sizeof(ib_spec->mpls.mask.tag));
+			break;
+		case IB_FLOW_SPEC_GRE:
+			if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+						   ft_field_support.outer_first_mpls_over_gre),
+						   &ib_spec->mpls.mask.tag))
+				return -EOPNOTSUPP;
+
+			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
+					    outer_first_mpls_over_gre),
+			       &ib_spec->mpls.val.tag,
+			       sizeof(ib_spec->mpls.val.tag));
+			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
+					    outer_first_mpls_over_gre),
+			       &ib_spec->mpls.mask.tag,
+			       sizeof(ib_spec->mpls.mask.tag));
+			break;
+		default:
+			if (ib_spec->type & IB_FLOW_SPEC_INNER) {
+				if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+							   ft_field_support.inner_first_mpls),
+							   &ib_spec->mpls.mask.tag))
+					return -EOPNOTSUPP;
+
+				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
+						    inner_first_mpls),
+				       &ib_spec->mpls.val.tag,
+				       sizeof(ib_spec->mpls.val.tag));
+				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
+						    inner_first_mpls),
+				       &ib_spec->mpls.mask.tag,
+				       sizeof(ib_spec->mpls.mask.tag));
+			} else {
+				if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+							   ft_field_support.outer_first_mpls),
+							   &ib_spec->mpls.mask.tag))
+					return -EOPNOTSUPP;
+
+				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
+						    outer_first_mpls),
+				       &ib_spec->mpls.val.tag,
+				       sizeof(ib_spec->mpls.val.tag));
+				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
+						    outer_first_mpls),
+				       &ib_spec->mpls.mask.tag,
+				       sizeof(ib_spec->mpls.mask.tag));
+			}
+		}
+		break;
+	case IB_FLOW_SPEC_VXLAN_TUNNEL:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask,
+					 LAST_TUNNEL_FIELD))
+			return -EOPNOTSUPP;
+
+		MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni,
+			 ntohl(ib_spec->tunnel.mask.tunnel_id));
+		MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni,
+			 ntohl(ib_spec->tunnel.val.tunnel_id));
+		break;
+	case IB_FLOW_SPEC_ACTION_TAG:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag,
+					 LAST_FLOW_TAG_FIELD))
+			return -EOPNOTSUPP;
+		if (ib_spec->flow_tag.tag_id >= BIT(24))
+			return -EINVAL;
+
+		flow_context->flow_tag = ib_spec->flow_tag.tag_id;
+		flow_context->flags |= FLOW_CONTEXT_HAS_TAG;
+		break;
+	case IB_FLOW_SPEC_ACTION_DROP:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
+					 LAST_DROP_FIELD))
+			return -EOPNOTSUPP;
+		action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
+		break;
+	case IB_FLOW_SPEC_ACTION_HANDLE:
+		ret = parse_flow_flow_action(to_mflow_act(ib_spec->action.act),
+			flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS, action);
+		if (ret)
+			return ret;
+		break;
+	case IB_FLOW_SPEC_ACTION_COUNT:
+		if (FIELDS_NOT_SUPPORTED(ib_spec->flow_count,
+					 LAST_COUNTERS_FIELD))
+			return -EOPNOTSUPP;
+
+		/* For now, only one counters spec per flow is supported */
+		if (action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
+			return -EINVAL;
+
+		action->counters = ib_spec->flow_count.counters;
+		action->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* A flow that could match both multicast and unicast packets must not be
+ * placed in the multicast flow steering table, as such a rule could steal
+ * other multicast packets.
+ */
+static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr)
+{
+	union ib_flow_spec *flow_spec;
+
+	if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
+	    ib_attr->num_of_specs < 1)
+		return false;
+
+	flow_spec = (union ib_flow_spec *)(ib_attr + 1);
+	if (flow_spec->type == IB_FLOW_SPEC_IPV4) {
+		struct ib_flow_spec_ipv4 *ipv4_spec;
+
+		ipv4_spec = (struct ib_flow_spec_ipv4 *)flow_spec;
+		if (ipv4_is_multicast(ipv4_spec->val.dst_ip))
+			return true;
+
+		return false;
+	}
+
+	if (flow_spec->type == IB_FLOW_SPEC_ETH) {
+		struct ib_flow_spec_eth *eth_spec;
+
+		eth_spec = (struct ib_flow_spec_eth *)flow_spec;
+		return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
+		       is_multicast_ether_addr(eth_spec->val.dst_mac);
+	}
+
+	return false;
+}
+
+enum valid_spec {
+	VALID_SPEC_INVALID,
+	VALID_SPEC_VALID,
+	VALID_SPEC_NA,
+};
+
+static enum valid_spec
+is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev,
+		     const struct mlx5_flow_spec *spec,
+		     const struct mlx5_flow_act *flow_act,
+		     bool egress)
+{
+	const u32 *match_c = spec->match_criteria;
+	bool is_crypto =
+		(flow_act->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
+				     MLX5_FLOW_CONTEXT_ACTION_DECRYPT));
+	bool is_ipsec = mlx5_fs_is_ipsec_flow(match_c);
+	bool is_drop = flow_act->action & MLX5_FLOW_CONTEXT_ACTION_DROP;
+
+	/*
+	 * Currently only crypto rules are supported in egress; until regular
+	 * egress rules are supported, report non-crypto specs as
+	 * VALID_SPEC_NA.
+	 */
+	if (!is_crypto)
+		return VALID_SPEC_NA;
+
+	return is_crypto && is_ipsec &&
+		(!egress || (!is_drop &&
+			     !(spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG))) ?
+		VALID_SPEC_VALID : VALID_SPEC_INVALID;
+}
+
+static bool is_valid_spec(struct mlx5_core_dev *mdev,
+			  const struct mlx5_flow_spec *spec,
+			  const struct mlx5_flow_act *flow_act,
+			  bool egress)
+{
+	/* We currently only support IPsec egress flows */
+	return is_valid_esp_aes_gcm(mdev, spec, flow_act, egress) != VALID_SPEC_INVALID;
+}
+
+static bool is_valid_ethertype(struct mlx5_core_dev *mdev,
+			       const struct ib_flow_attr *flow_attr,
+			       bool check_inner)
+{
+	union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
+	int match_ipv = check_inner ?
+			MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+					ft_field_support.inner_ip_version) :
+			MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
+					ft_field_support.outer_ip_version);
+	int inner_bit = check_inner ? IB_FLOW_SPEC_INNER : 0;
+	bool ipv4_spec_valid, ipv6_spec_valid;
+	unsigned int ip_spec_type = 0;
+	bool has_ethertype = false;
+	unsigned int spec_index;
+	bool mask_valid = true;
+	u16 eth_type = 0;
+	bool type_valid;
+
+	/* Validate that ethertype is correct */
+	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
+		if ((ib_spec->type == (IB_FLOW_SPEC_ETH | inner_bit)) &&
+		    ib_spec->eth.mask.ether_type) {
+			mask_valid = (ib_spec->eth.mask.ether_type ==
+				      htons(0xffff));
+			has_ethertype = true;
+			eth_type = ntohs(ib_spec->eth.val.ether_type);
+		} else if ((ib_spec->type == (IB_FLOW_SPEC_IPV4 | inner_bit)) ||
+			   (ib_spec->type == (IB_FLOW_SPEC_IPV6 | inner_bit))) {
+			ip_spec_type = ib_spec->type;
+		}
+		ib_spec = (void *)ib_spec + ib_spec->size;
+	}
+
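+	/*
+	 * The spec set is valid when there is no ethertype/L3 pair at all, or
+	 * when a fully-masked ethertype agrees with the L3 spec family (or is
+	 * MPLS while the device can match on ip_version).
+	 */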
+	type_valid = (!has_ethertype) || (!ip_spec_type);
+	if (!type_valid && mask_valid) {
+		ipv4_spec_valid = (eth_type == ETH_P_IP) &&
+			(ip_spec_type == (IB_FLOW_SPEC_IPV4 | inner_bit));
+		ipv6_spec_valid = (eth_type == ETH_P_IPV6) &&
+			(ip_spec_type == (IB_FLOW_SPEC_IPV6 | inner_bit));
+
+		type_valid = (ipv4_spec_valid) || (ipv6_spec_valid) ||
+			     (((eth_type == ETH_P_MPLS_UC) ||
+			       (eth_type == ETH_P_MPLS_MC)) && match_ipv);
+	}
+
+	return type_valid;
+}
+
+static bool is_valid_attr(struct mlx5_core_dev *mdev,
+			  const struct ib_flow_attr *flow_attr)
+{
+	return is_valid_ethertype(mdev, flow_attr, false) &&
+	       is_valid_ethertype(mdev, flow_attr, true);
+}
+
+static void put_flow_table(struct mlx5_ib_dev *dev,
+			   struct mlx5_ib_flow_prio *prio, bool ft_added)
+{
+	prio->refcount -= !!ft_added;
+	if (!prio->refcount) {
+		mlx5_destroy_flow_table(prio->flow_table);
+		prio->flow_table = NULL;
+	}
+}
+
+static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
+{
+	struct mlx5_ib_flow_handler *handler = container_of(flow_id,
+							  struct mlx5_ib_flow_handler,
+							  ibflow);
+	struct mlx5_ib_flow_handler *iter, *tmp;
+	struct mlx5_ib_dev *dev = handler->dev;
+
+	mutex_lock(&dev->flow_db->lock);
+
+	list_for_each_entry_safe(iter, tmp, &handler->list, list) {
+		mlx5_del_flow_rules(iter->rule);
+		put_flow_table(dev, iter->prio, true);
+		list_del(&iter->list);
+		kfree(iter);
+	}
+
+	mlx5_del_flow_rules(handler->rule);
+	put_flow_table(dev, handler->prio, true);
+	mlx5_ib_counters_clear_description(handler->ibcounters);
+	mutex_unlock(&dev->flow_db->lock);
+	if (handler->flow_matcher)
+		atomic_dec(&handler->flow_matcher->usecnt);
+	kfree(handler);
+
+	return 0;
+}
+
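+/*
+ * Each IB flow priority maps to a pair of core priorities: the even slot is
+ * used for don't-trap rules and the odd slot for regular rules.
+ */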
+static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
+{
+	priority *= 2;
+	if (!dont_trap)
+		priority++;
+	return priority;
+}
+
+enum flow_table_type {
+	MLX5_IB_FT_RX,
+	MLX5_IB_FT_TX
+};
+
+#define MLX5_FS_MAX_TYPES	 6
+#define MLX5_FS_MAX_ENTRIES	 BIT(16)
+
+static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
+					   struct mlx5_ib_flow_prio *prio,
+					   int priority,
+					   int num_entries, int num_groups,
+					   u32 flags)
+{
+	struct mlx5_flow_table_attr ft_attr = {};
+	struct mlx5_flow_table *ft;
+
+	ft_attr.prio = priority;
+	ft_attr.max_fte = num_entries;
+	ft_attr.flags = flags;
+	ft_attr.autogroup.max_num_groups = num_groups;
+	ft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
+	if (IS_ERR(ft))
+		return ERR_CAST(ft);
+
+	prio->flow_table = ft;
+	prio->refcount = 0;
+	return prio;
+}
+
+static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
+						struct ib_flow_attr *flow_attr,
+						enum flow_table_type ft_type)
+{
+	bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
+	struct mlx5_flow_namespace *ns = NULL;
+	enum mlx5_flow_namespace_type fn_type;
+	struct mlx5_ib_flow_prio *prio;
+	struct mlx5_flow_table *ft;
+	int max_table_size;
+	int num_entries;
+	int num_groups;
+	bool esw_encap;
+	u32 flags = 0;
+	int priority;
+
+	max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+						       log_max_ft_size));
+	esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
+		DEVLINK_ESWITCH_ENCAP_MODE_NONE;
+	switch (flow_attr->type) {
+	case IB_FLOW_ATTR_NORMAL:
+		if (flow_is_multicast_only(flow_attr) && !dont_trap)
+			priority = MLX5_IB_FLOW_MCAST_PRIO;
+		else
+			priority = ib_prio_to_core_prio(flow_attr->priority,
+							dont_trap);
+		if (ft_type == MLX5_IB_FT_RX) {
+			fn_type = MLX5_FLOW_NAMESPACE_BYPASS;
+			prio = &dev->flow_db->prios[priority];
+			if (!dev->is_rep && !esw_encap &&
+			    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap))
+				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
+			if (!dev->is_rep && !esw_encap &&
+			    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+						      reformat_l3_tunnel_to_l2))
+				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+		} else {
+			max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_TX(
+				dev->mdev, log_max_ft_size));
+			fn_type = MLX5_FLOW_NAMESPACE_EGRESS;
+			prio = &dev->flow_db->egress_prios[priority];
+			if (!dev->is_rep && !esw_encap &&
+			    MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat))
+				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+		}
+		ns = mlx5_get_flow_namespace(dev->mdev, fn_type);
+		num_entries = MLX5_FS_MAX_ENTRIES;
+		num_groups = MLX5_FS_MAX_TYPES;
+		break;
+	case IB_FLOW_ATTR_ALL_DEFAULT:
+	case IB_FLOW_ATTR_MC_DEFAULT:
+		ns = mlx5_get_flow_namespace(dev->mdev,
+					     MLX5_FLOW_NAMESPACE_LEFTOVERS);
+		build_leftovers_ft_param(&priority, &num_entries, &num_groups);
+		prio = &dev->flow_db->prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
+		break;
+	case IB_FLOW_ATTR_SNIFFER:
+		if (!MLX5_CAP_FLOWTABLE(dev->mdev,
+					allow_sniffer_and_nic_rx_shared_tir))
+			return ERR_PTR(-EOPNOTSUPP);
+
+		ns = mlx5_get_flow_namespace(
+			dev->mdev, ft_type == MLX5_IB_FT_RX ?
+					   MLX5_FLOW_NAMESPACE_SNIFFER_RX :
+					   MLX5_FLOW_NAMESPACE_SNIFFER_TX);
+
+		prio = &dev->flow_db->sniffer[ft_type];
+		priority = 0;
+		num_entries = 1;
+		num_groups = 1;
+		break;
+	default:
+		break;
+	}
+
+	if (!ns)
+		return ERR_PTR(-EOPNOTSUPP);
+
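+	/* Do not request more entries than the device supports */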
+	max_table_size = min_t(int, num_entries, max_table_size);
+
+	ft = prio->flow_table;
+	if (!ft)
+		return _get_prio(ns, prio, priority, max_table_size, num_groups,
+				 flags);
+
+	return prio;
+}
+
+static void set_underlay_qp(struct mlx5_ib_dev *dev,
+			    struct mlx5_flow_spec *spec,
+			    u32 underlay_qpn)
+{
+	void *misc_params_c = MLX5_ADDR_OF(fte_match_param,
+					   spec->match_criteria,
+					   misc_parameters);
+	void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+					   misc_parameters);
+
+	if (underlay_qpn &&
+	    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+				      ft_field_support.bth_dst_qp)) {
+		MLX5_SET(fte_match_set_misc,
+			 misc_params_v, bth_dst_qp, underlay_qpn);
+		MLX5_SET(fte_match_set_misc,
+			 misc_params_c, bth_dst_qp, 0xffffff);
+	}
+}
+
+static void mlx5_ib_set_rule_source_port(struct mlx5_ib_dev *dev,
+					 struct mlx5_flow_spec *spec,
+					 struct mlx5_eswitch_rep *rep)
+{
+	struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
+	void *misc;
+
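+	/*
+	 * Match the source vport either via the vport metadata register or
+	 * directly on the source_port field.
+	 */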
+	if (mlx5_eswitch_vport_match_metadata_enabled(esw)) {
+		misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				    misc_parameters_2);
+
+		MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
+			 mlx5_eswitch_get_vport_metadata_for_match(esw,
+								   rep->vport));
+		misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				    misc_parameters_2);
+
+		MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
+			 mlx5_eswitch_get_vport_metadata_mask());
+	} else {
+		misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
+				    misc_parameters);
+
+		MLX5_SET(fte_match_set_misc, misc, source_port, rep->vport);
+
+		misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
+				    misc_parameters);
+
+		MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
+	}
+}
+
+static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
+						      struct mlx5_ib_flow_prio *ft_prio,
+						      const struct ib_flow_attr *flow_attr,
+						      struct mlx5_flow_destination *dst,
+						      u32 underlay_qpn,
+						      struct mlx5_ib_create_flow *ucmd)
+{
+	struct mlx5_flow_table	*ft = ft_prio->flow_table;
+	struct mlx5_ib_flow_handler *handler;
+	struct mlx5_flow_act flow_act = {};
+	struct mlx5_flow_spec *spec;
+	struct mlx5_flow_destination dest_arr[2] = {};
+	struct mlx5_flow_destination *rule_dst = dest_arr;
+	const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
+	unsigned int spec_index;
+	u32 prev_type = 0;
+	int err = 0;
+	int dest_num = 0;
+	bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
+
+	if (!is_valid_attr(dev->mdev, flow_attr))
+		return ERR_PTR(-EINVAL);
+
+	if (dev->is_rep && is_egress)
+		return ERR_PTR(-EINVAL);
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
+	if (!handler || !spec) {
+		err = -ENOMEM;
+		goto free;
+	}
+
+	INIT_LIST_HEAD(&handler->list);
+
+	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
+		err = parse_flow_attr(dev->mdev, spec,
+				      ib_flow, flow_attr, &flow_act,
+				      prev_type);
+		if (err < 0)
+			goto free;
+
+		prev_type = ((union ib_flow_spec *)ib_flow)->type;
+		ib_flow += ((union ib_flow_spec *)ib_flow)->size;
+	}
+
+	if (dst && !(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP)) {
+		memcpy(&dest_arr[0], dst, sizeof(*dst));
+		dest_num++;
+	}
+
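+	/* Multicast-only flows are not restricted to the underlay QP */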
+	if (!flow_is_multicast_only(flow_attr))
+		set_underlay_qp(dev, spec, underlay_qpn);
+
+	if (dev->is_rep && flow_attr->type != IB_FLOW_ATTR_SNIFFER) {
+		struct mlx5_eswitch_rep *rep;
+
+		rep = dev->port[flow_attr->port - 1].rep;
+		if (!rep) {
+			err = -EINVAL;
+			goto free;
+		}
+
+		mlx5_ib_set_rule_source_port(dev, spec, rep);
+	}
+
+	spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
+
+	if (is_egress &&
+	    !is_valid_spec(dev->mdev, spec, &flow_act, is_egress)) {
+		err = -EINVAL;
+		goto free;
+	}
+
+	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		struct mlx5_ib_mcounters *mcounters;
+
+		err = mlx5_ib_flow_counters_set_data(flow_act.counters, ucmd);
+		if (err)
+			goto free;
+
+		mcounters = to_mcounters(flow_act.counters);
+		handler->ibcounters = flow_act.counters;
+		dest_arr[dest_num].type =
+			MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+		dest_arr[dest_num].counter_id =
+			mlx5_fc_id(mcounters->hw_cntrs_hndl);
+		dest_num++;
+	}
+
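+	/*
+	 * Drop rules without a counter need no destination; other rules get
+	 * the forward/allow action that matches the destinations set above.
+	 */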
+	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) {
+		if (!dest_num)
+			rule_dst = NULL;
+	} else {
+		if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)
+			flow_act.action |=
+				MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
+		if (is_egress)
+			flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
+		else if (dest_num)
+			flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+	}
+
+	if ((spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG) &&
+	    (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
+	     flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
+		mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
+			     spec->flow_context.flow_tag, flow_attr->type);
+		err = -EINVAL;
+		goto free;
+	}
+	handler->rule = mlx5_add_flow_rules(ft, spec,
+					    &flow_act,
+					    rule_dst, dest_num);
+
+	if (IS_ERR(handler->rule)) {
+		err = PTR_ERR(handler->rule);
+		goto free;
+	}
+
+	ft_prio->refcount++;
+	handler->prio = ft_prio;
+	handler->dev = dev;
+
+	ft_prio->flow_table = ft;
+free:
+	if (err && handler) {
+		mlx5_ib_counters_clear_description(handler->ibcounters);
+		kfree(handler);
+	}
+	kvfree(spec);
+	return err ? ERR_PTR(err) : handler;
+}
+
+static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
+						     struct mlx5_ib_flow_prio *ft_prio,
+						     const struct ib_flow_attr *flow_attr,
+						     struct mlx5_flow_destination *dst)
+{
+	return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL);
+}
+
+enum {
+	LEFTOVERS_MC,
+	LEFTOVERS_UC,
+};
+
+static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
+							  struct mlx5_ib_flow_prio *ft_prio,
+							  struct ib_flow_attr *flow_attr,
+							  struct mlx5_flow_destination *dst)
+{
+	struct mlx5_ib_flow_handler *handler_ucast = NULL;
+	struct mlx5_ib_flow_handler *handler = NULL;
+
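+	/*
+	 * Catch-all ETH specs: the MC entry matches frames with the multicast
+	 * bit set in the destination MAC, the UC entry matches the rest.
+	 */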
+	static struct {
+		struct ib_flow_attr	flow_attr;
+		struct ib_flow_spec_eth eth_flow;
+	} leftovers_specs[] = {
+		[LEFTOVERS_MC] = {
+			.flow_attr = {
+				.num_of_specs = 1,
+				.size = sizeof(leftovers_specs[0])
+			},
+			.eth_flow = {
+				.type = IB_FLOW_SPEC_ETH,
+				.size = sizeof(struct ib_flow_spec_eth),
+				.mask = {.dst_mac = {0x1} },
+				.val =  {.dst_mac = {0x1} }
+			}
+		},
+		[LEFTOVERS_UC] = {
+			.flow_attr = {
+				.num_of_specs = 1,
+				.size = sizeof(leftovers_specs[0])
+			},
+			.eth_flow = {
+				.type = IB_FLOW_SPEC_ETH,
+				.size = sizeof(struct ib_flow_spec_eth),
+				.mask = {.dst_mac = {0x1} },
+				.val = {.dst_mac = {} }
+			}
+		}
+	};
+
+	handler = create_flow_rule(dev, ft_prio,
+				   &leftovers_specs[LEFTOVERS_MC].flow_attr,
+				   dst);
+	if (!IS_ERR(handler) &&
+	    flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
+		handler_ucast = create_flow_rule(dev, ft_prio,
+						 &leftovers_specs[LEFTOVERS_UC].flow_attr,
+						 dst);
+		if (IS_ERR(handler_ucast)) {
+			mlx5_del_flow_rules(handler->rule);
+			ft_prio->refcount--;
+			kfree(handler);
+			handler = handler_ucast;
+		} else {
+			list_add(&handler_ucast->list, &handler->list);
+		}
+	}
+
+	return handler;
+}
+
+static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
+							struct mlx5_ib_flow_prio *ft_rx,
+							struct mlx5_ib_flow_prio *ft_tx,
+							struct mlx5_flow_destination *dst)
+{
+	struct mlx5_ib_flow_handler *handler_rx;
+	struct mlx5_ib_flow_handler *handler_tx;
+	int err;
+	static const struct ib_flow_attr flow_attr = {
+		.num_of_specs = 0,
+		.type = IB_FLOW_ATTR_SNIFFER,
+		.size = sizeof(flow_attr)
+	};
+
+	handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst);
+	if (IS_ERR(handler_rx)) {
+		err = PTR_ERR(handler_rx);
+		goto err;
+	}
+
+	handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst);
+	if (IS_ERR(handler_tx)) {
+		err = PTR_ERR(handler_tx);
+		goto err_tx;
+	}
+
+	list_add(&handler_tx->list, &handler_rx->list);
+
+	return handler_rx;
+
+err_tx:
+	mlx5_del_flow_rules(handler_rx->rule);
+	ft_rx->refcount--;
+	kfree(handler_rx);
+err:
+	return ERR_PTR(err);
+}
+
+static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
+					   struct ib_flow_attr *flow_attr,
+					   struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(qp->device);
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+	struct mlx5_ib_flow_handler *handler = NULL;
+	struct mlx5_flow_destination *dst = NULL;
+	struct mlx5_ib_flow_prio *ft_prio_tx = NULL;
+	struct mlx5_ib_flow_prio *ft_prio;
+	bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
+	struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr;
+	size_t min_ucmd_sz, required_ucmd_sz;
+	int err;
+	int underlay_qpn;
+
+	if (udata && udata->inlen) {
+		min_ucmd_sz = offsetofend(struct mlx5_ib_create_flow, reserved);
+		if (udata->inlen < min_ucmd_sz)
+			return ERR_PTR(-EOPNOTSUPP);
+
+		err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz);
+		if (err)
+			return ERR_PTR(err);
+
+		/* currently supports only one counters data */
+		if (ucmd_hdr.ncounters_data > 1)
+			return ERR_PTR(-EINVAL);
+
+		required_ucmd_sz = min_ucmd_sz +
+			sizeof(struct mlx5_ib_flow_counters_data) *
+			ucmd_hdr.ncounters_data;
+		if (udata->inlen > required_ucmd_sz &&
+		    !ib_is_udata_cleared(udata, required_ucmd_sz,
+					 udata->inlen - required_ucmd_sz))
+			return ERR_PTR(-EOPNOTSUPP);
+
+		ucmd = kzalloc(required_ucmd_sz, GFP_KERNEL);
+		if (!ucmd)
+			return ERR_PTR(-ENOMEM);
+
+		err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz);
+		if (err)
+			goto free_ucmd;
+	}
+
+	if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) {
+		err = -ENOMEM;
+		goto free_ucmd;
+	}
+
+	if (flow_attr->port > dev->num_ports ||
+	    (flow_attr->flags &
+	     ~(IB_FLOW_ATTR_FLAGS_DONT_TRAP | IB_FLOW_ATTR_FLAGS_EGRESS))) {
+		err = -EINVAL;
+		goto free_ucmd;
+	}
+
+	if (is_egress &&
+	    (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
+	     flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
+		err = -EINVAL;
+		goto free_ucmd;
+	}
+
+	dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+	if (!dst) {
+		err = -ENOMEM;
+		goto free_ucmd;
+	}
+
+	mutex_lock(&dev->flow_db->lock);
+
+	ft_prio = get_flow_table(dev, flow_attr,
+				 is_egress ? MLX5_IB_FT_TX : MLX5_IB_FT_RX);
+	if (IS_ERR(ft_prio)) {
+		err = PTR_ERR(ft_prio);
+		goto unlock;
+	}
+	if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
+		ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX);
+		if (IS_ERR(ft_prio_tx)) {
+			err = PTR_ERR(ft_prio_tx);
+			ft_prio_tx = NULL;
+			goto destroy_ft;
+		}
+	}
+
+	if (is_egress) {
+		dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT;
+	} else {
+		dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+		if (mqp->is_rss)
+			dst->tir_num = mqp->rss_qp.tirn;
+		else
+			dst->tir_num = mqp->raw_packet_qp.rq.tirn;
+	}
+
+	switch (flow_attr->type) {
+	case IB_FLOW_ATTR_NORMAL:
+		underlay_qpn = (mqp->flags & IB_QP_CREATE_SOURCE_QPN) ?
+				       mqp->underlay_qpn :
+				       0;
+		handler = _create_flow_rule(dev, ft_prio, flow_attr, dst,
+					    underlay_qpn, ucmd);
+		break;
+	case IB_FLOW_ATTR_ALL_DEFAULT:
+	case IB_FLOW_ATTR_MC_DEFAULT:
+		handler = create_leftovers_rule(dev, ft_prio, flow_attr, dst);
+		break;
+	case IB_FLOW_ATTR_SNIFFER:
+		handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst);
+		break;
+	default:
+		err = -EINVAL;
+		goto destroy_ft;
+	}
+
+	if (IS_ERR(handler)) {
+		err = PTR_ERR(handler);
+		handler = NULL;
+		goto destroy_ft;
+	}
+
+	mutex_unlock(&dev->flow_db->lock);
+	kfree(dst);
+	kfree(ucmd);
+
+	return &handler->ibflow;
+
+destroy_ft:
+	put_flow_table(dev, ft_prio, false);
+	if (ft_prio_tx)
+		put_flow_table(dev, ft_prio_tx, false);
+unlock:
+	mutex_unlock(&dev->flow_db->lock);
+	kfree(dst);
+free_ucmd:
+	kfree(ucmd);
+	return ERR_PTR(err);
+}
+
+static struct mlx5_ib_flow_prio *
+_get_flow_table(struct mlx5_ib_dev *dev,
+		struct mlx5_ib_flow_matcher *fs_matcher,
+		bool mcast)
+{
+	struct mlx5_flow_namespace *ns = NULL;
+	struct mlx5_ib_flow_prio *prio = NULL;
+	int max_table_size = 0;
+	bool esw_encap;
+	u32 flags = 0;
+	int priority;
+
+	if (mcast)
+		priority = MLX5_IB_FLOW_MCAST_PRIO;
+	else
+		priority = ib_prio_to_core_prio(fs_matcher->priority, false);
+
+	esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
+		DEVLINK_ESWITCH_ENCAP_MODE_NONE;
+	switch (fs_matcher->ns_type) {
+	case MLX5_FLOW_NAMESPACE_BYPASS:
+		max_table_size = BIT(
+			MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size));
+		if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap) && !esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
+		if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+					      reformat_l3_tunnel_to_l2) &&
+		    !esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+		break;
+	case MLX5_FLOW_NAMESPACE_EGRESS:
+		max_table_size = BIT(
+			MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, log_max_ft_size));
+		if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat) &&
+		    !esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+		break;
+	case MLX5_FLOW_NAMESPACE_FDB:
+		max_table_size = BIT(
+			MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, log_max_ft_size));
+		if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, decap) && esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
+		if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev,
+					       reformat_l3_tunnel_to_l2) &&
+		    esw_encap)
+			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+		priority = FDB_BYPASS_PATH;
+		break;
+	case MLX5_FLOW_NAMESPACE_RDMA_RX:
+		max_table_size = BIT(
+			MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev, log_max_ft_size));
+		priority = fs_matcher->priority;
+		break;
+	case MLX5_FLOW_NAMESPACE_RDMA_TX:
+		max_table_size = BIT(
+			MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, log_max_ft_size));
+		priority = fs_matcher->priority;
+		break;
+	default:
+		break;
+	}
+
+	max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
+
+	ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type);
+	if (!ns)
+		return ERR_PTR(-EOPNOTSUPP);
+
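+	/* Pick the flow_db priority slot that matches the namespace type */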
+	switch (fs_matcher->ns_type) {
+	case MLX5_FLOW_NAMESPACE_BYPASS:
+		prio = &dev->flow_db->prios[priority];
+		break;
+	case MLX5_FLOW_NAMESPACE_EGRESS:
+		prio = &dev->flow_db->egress_prios[priority];
+		break;
+	case MLX5_FLOW_NAMESPACE_FDB:
+		prio = &dev->flow_db->fdb;
+		break;
+	case MLX5_FLOW_NAMESPACE_RDMA_RX:
+		prio = &dev->flow_db->rdma_rx[priority];
+		break;
+	case MLX5_FLOW_NAMESPACE_RDMA_TX:
+		prio = &dev->flow_db->rdma_tx[priority];
+		break;
+	default: return ERR_PTR(-EINVAL);
+	}
+
+	if (!prio)
+		return ERR_PTR(-EINVAL);
+
+	if (prio->flow_table)
+		return prio;
+
+	return _get_prio(ns, prio, priority, max_table_size,
+			 MLX5_FS_MAX_TYPES, flags);
+}
+
+static struct mlx5_ib_flow_handler *
+_create_raw_flow_rule(struct mlx5_ib_dev *dev,
+		      struct mlx5_ib_flow_prio *ft_prio,
+		      struct mlx5_flow_destination *dst,
+		      struct mlx5_ib_flow_matcher *fs_matcher,
+		      struct mlx5_flow_context *flow_context,
+		      struct mlx5_flow_act *flow_act,
+		      void *cmd_in, int inlen,
+		      int dst_num)
+{
+	struct mlx5_ib_flow_handler *handler;
+	struct mlx5_flow_spec *spec;
+	struct mlx5_flow_table *ft = ft_prio->flow_table;
+	int err = 0;
+
+	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
+	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
+	if (!handler || !spec) {
+		err = -ENOMEM;
+		goto free;
+	}
+
+	INIT_LIST_HEAD(&handler->list);
+
+	memcpy(spec->match_value, cmd_in, inlen);
+	memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params,
+	       fs_matcher->mask_len);
+	spec->match_criteria_enable = fs_matcher->match_criteria_enable;
+	spec->flow_context = *flow_context;
+
+	handler->rule = mlx5_add_flow_rules(ft, spec,
+					    flow_act, dst, dst_num);
+
+	if (IS_ERR(handler->rule)) {
+		err = PTR_ERR(handler->rule);
+		goto free;
+	}
+
+	ft_prio->refcount++;
+	handler->prio = ft_prio;
+	handler->dev = dev;
+	ft_prio->flow_table = ft;
+
+free:
+	if (err)
+		kfree(handler);
+	kvfree(spec);
+	return err ? ERR_PTR(err) : handler;
+}
+
+static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher,
+				void *match_v)
+{
+	void *match_c;
+	void *match_v_set_lyr_2_4, *match_c_set_lyr_2_4;
+	void *dmac, *dmac_mask;
+	void *ipv4, *ipv4_mask;
+
+	if (!(fs_matcher->match_criteria_enable &
+	      (1 << MATCH_CRITERIA_ENABLE_OUTER_BIT)))
+		return false;
+
+	match_c = fs_matcher->matcher_mask.match_params;
+	match_v_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_v,
+					   outer_headers);
+	match_c_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_c,
+					   outer_headers);
+
+	dmac = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
+			    dmac_47_16);
+	dmac_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
+				 dmac_47_16);
+
+	if (is_multicast_ether_addr(dmac) &&
+	    is_multicast_ether_addr(dmac_mask))
+		return true;
+
+	ipv4 = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
+			    dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
+
+	ipv4_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
+				 dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
+
+	if (ipv4_is_multicast(*(__be32 *)(ipv4)) &&
+	    ipv4_is_multicast(*(__be32 *)(ipv4_mask)))
+		return true;
+
+	return false;
+}
+
+static struct mlx5_ib_flow_handler *raw_fs_rule_add(
+	struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher,
+	struct mlx5_flow_context *flow_context, struct mlx5_flow_act *flow_act,
+	u32 counter_id, void *cmd_in, int inlen, int dest_id, int dest_type)
+{
+	struct mlx5_flow_destination *dst;
+	struct mlx5_ib_flow_prio *ft_prio;
+	struct mlx5_ib_flow_handler *handler;
+	int dst_num = 0;
+	bool mcast;
+	int err;
+
+	if (fs_matcher->flow_type != MLX5_IB_FLOW_TYPE_NORMAL)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO)
+		return ERR_PTR(-ENOMEM);
+
+	dst = kcalloc(2, sizeof(*dst), GFP_KERNEL);
+	if (!dst)
+		return ERR_PTR(-ENOMEM);
+
+	mcast = raw_fs_is_multicast(fs_matcher, cmd_in);
+	mutex_lock(&dev->flow_db->lock);
+
+	ft_prio = _get_flow_table(dev, fs_matcher, mcast);
+	if (IS_ERR(ft_prio)) {
+		err = PTR_ERR(ft_prio);
+		goto unlock;
+	}
+
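+	/*
+	 * Build the destination list: forward to a TIR or flow table, or
+	 * allow on the port; a counter destination may be appended below.
+	 */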
+	switch (dest_type) {
+	case MLX5_FLOW_DESTINATION_TYPE_TIR:
+		dst[dst_num].type = dest_type;
+		dst[dst_num++].tir_num = dest_id;
+		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+		break;
+	case MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE:
+		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM;
+		dst[dst_num++].ft_num = dest_id;
+		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
+		break;
+	case MLX5_FLOW_DESTINATION_TYPE_PORT:
+		dst[dst_num++].type = MLX5_FLOW_DESTINATION_TYPE_PORT;
+		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
+		break;
+	default:
+		break;
+	}
+
+	if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
+		dst[dst_num].counter_id = counter_id;
+		dst_num++;
+	}
+
+	handler = _create_raw_flow_rule(dev, ft_prio, dst_num ? dst : NULL,
+					fs_matcher, flow_context, flow_act,
+					cmd_in, inlen, dst_num);
+
+	if (IS_ERR(handler)) {
+		err = PTR_ERR(handler);
+		goto destroy_ft;
+	}
+
+	mutex_unlock(&dev->flow_db->lock);
+	atomic_inc(&fs_matcher->usecnt);
+	handler->flow_matcher = fs_matcher;
+
+	kfree(dst);
+
+	return handler;
+
+destroy_ft:
+	put_flow_table(dev, ft_prio, false);
+unlock:
+	mutex_unlock(&dev->flow_db->lock);
+	kfree(dst);
+
+	return ERR_PTR(err);
+}
+
+static u32 mlx5_ib_flow_action_flags_to_accel_xfrm_flags(u32 mlx5_flags)
+{
+	u32 flags = 0;
+
+	if (mlx5_flags & MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA)
+		flags |= MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA;
+
+	return flags;
+}
+
+#define MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED                             \
+	MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA
+static struct ib_flow_action *
+mlx5_ib_create_flow_action_esp(struct ib_device *device,
+			       const struct ib_flow_action_attrs_esp *attr,
+			       struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_dev *mdev = to_mdev(device);
+	struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm;
+	struct mlx5_accel_esp_xfrm_attrs accel_attrs = {};
+	struct mlx5_ib_flow_action *action;
+	u64 action_flags;
+	u64 flags;
+	int err = 0;
+
+	err = uverbs_get_flags64(
+		&action_flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
+		((MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1) - 1));
+	if (err)
+		return ERR_PTR(err);
+
+	flags = mlx5_ib_flow_action_flags_to_accel_xfrm_flags(action_flags);
+
+	/* We currently only support a subset of the standard features: only a
+	 * keymat of type AES_GCM, with icv_len == 16, iv_algo == SEQ and ESN
+	 * (with overlap). Full offload mode isn't supported.
+	 */
+	if (!attr->keymat || attr->replay || attr->encap ||
+	    attr->spi || attr->seq || attr->tfc_pad ||
+	    attr->hard_limit_pkts ||
+	    (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
+			     IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	if (attr->keymat->protocol !=
+	    IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	aes_gcm = &attr->keymat->keymat.aes_gcm;
+
+	if (aes_gcm->icv_len != 16 ||
+	    aes_gcm->iv_algo != IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	action = kmalloc(sizeof(*action), GFP_KERNEL);
+	if (!action)
+		return ERR_PTR(-ENOMEM);
+
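+	/*
+	 * Translate the verbs AES-GCM keymat into the accel xfrm attributes;
+	 * key and ICV lengths are converted from bytes to bits.
+	 */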
+	action->esp_aes_gcm.ib_flags = attr->flags;
+	memcpy(&accel_attrs.keymat.aes_gcm.aes_key, &aes_gcm->aes_key,
+	       sizeof(accel_attrs.keymat.aes_gcm.aes_key));
+	accel_attrs.keymat.aes_gcm.key_len = aes_gcm->key_len * 8;
+	memcpy(&accel_attrs.keymat.aes_gcm.salt, &aes_gcm->salt,
+	       sizeof(accel_attrs.keymat.aes_gcm.salt));
+	memcpy(&accel_attrs.keymat.aes_gcm.seq_iv, &aes_gcm->iv,
+	       sizeof(accel_attrs.keymat.aes_gcm.seq_iv));
+	accel_attrs.keymat.aes_gcm.icv_len = aes_gcm->icv_len * 8;
+	accel_attrs.keymat.aes_gcm.iv_algo = MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ;
+	accel_attrs.keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM;
+
+	accel_attrs.esn = attr->esn;
+	if (attr->flags & IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED)
+		accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED;
+	if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
+		accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
+
+	if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)
+		accel_attrs.action |= MLX5_ACCEL_ESP_ACTION_ENCRYPT;
+
+	action->esp_aes_gcm.ctx =
+		mlx5_accel_esp_create_xfrm(mdev->mdev, &accel_attrs, flags);
+	if (IS_ERR(action->esp_aes_gcm.ctx)) {
+		err = PTR_ERR(action->esp_aes_gcm.ctx);
+		goto err_parse;
+	}
+
+	action->esp_aes_gcm.ib_flags = attr->flags;
+
+	return &action->ib_action;
+
+err_parse:
+	kfree(action);
+	return ERR_PTR(err);
+}
+
+static int
+mlx5_ib_modify_flow_action_esp(struct ib_flow_action *action,
+			       const struct ib_flow_action_attrs_esp *attr,
+			       struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_flow_action *maction = to_mflow_act(action);
+	struct mlx5_accel_esp_xfrm_attrs accel_attrs;
+	int err = 0;
+
+	if (attr->keymat || attr->replay || attr->encap ||
+	    attr->spi || attr->seq || attr->tfc_pad ||
+	    attr->hard_limit_pkts ||
+	    (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
+			     IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS |
+			     IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)))
+		return -EOPNOTSUPP;
+
+	/* Only the ESN value or the MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP can
+	 * be modified.
+	 */
+	if (!(maction->esp_aes_gcm.ib_flags &
+	      IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) &&
+	    attr->flags & (IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
+			   IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW))
+		return -EINVAL;
+
+	memcpy(&accel_attrs, &maction->esp_aes_gcm.ctx->attrs,
+	       sizeof(accel_attrs));
+
+	accel_attrs.esn = attr->esn;
+	if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
+		accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
+	else
+		accel_attrs.flags &= ~MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
+
+	err = mlx5_accel_esp_modify_xfrm(maction->esp_aes_gcm.ctx,
+					 &accel_attrs);
+	if (err)
+		return err;
+
+	maction->esp_aes_gcm.ib_flags &=
+		~IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
+	maction->esp_aes_gcm.ib_flags |=
+		attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
+
+	return 0;
+}
+
+static void destroy_flow_action_raw(struct mlx5_ib_flow_action *maction)
+{
+	switch (maction->flow_action_raw.sub_type) {
+	case MLX5_IB_FLOW_ACTION_MODIFY_HEADER:
+		mlx5_modify_header_dealloc(maction->flow_action_raw.dev->mdev,
+					   maction->flow_action_raw.modify_hdr);
+		break;
+	case MLX5_IB_FLOW_ACTION_PACKET_REFORMAT:
+		mlx5_packet_reformat_dealloc(maction->flow_action_raw.dev->mdev,
+					     maction->flow_action_raw.pkt_reformat);
+		break;
+	case MLX5_IB_FLOW_ACTION_DECAP:
+		break;
+	default:
+		break;
+	}
+}
+
+static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action)
+{
+	struct mlx5_ib_flow_action *maction = to_mflow_act(action);
+
+	switch (action->type) {
+	case IB_FLOW_ACTION_ESP:
+		/*
+		 * Only aes_gcm is supported for now, so we implicitly know this
+		 * is the underlying crypto.
+		 */
+		mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx);
+		break;
+	case IB_FLOW_ACTION_UNSPECIFIED:
+		destroy_flow_action_raw(maction);
+		break;
+	default:
+		WARN_ON(true);
+		break;
+	}
+
+	kfree(maction);
+	return 0;
+}
+
+static int
+mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type,
+			     enum mlx5_flow_namespace_type *namespace)
+{
+	switch (table_type) {
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX:
+		*namespace = MLX5_FLOW_NAMESPACE_BYPASS;
+		break;
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX:
+		*namespace = MLX5_FLOW_NAMESPACE_EGRESS;
+		break;
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB:
+		*namespace = MLX5_FLOW_NAMESPACE_FDB;
+		break;
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_RX:
+		*namespace = MLX5_FLOW_NAMESPACE_RDMA_RX;
+		break;
+	case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TX:
+		*namespace = MLX5_FLOW_NAMESPACE_RDMA_TX;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static const struct uverbs_attr_spec mlx5_ib_flow_type[] = {
+	[MLX5_IB_FLOW_TYPE_NORMAL] = {
+		.type = UVERBS_ATTR_TYPE_PTR_IN,
+		.u.ptr = {
+			.len = sizeof(u16), /* data is priority */
+			.min_len = sizeof(u16),
+		}
+	},
+	[MLX5_IB_FLOW_TYPE_SNIFFER] = {
+		.type = UVERBS_ATTR_TYPE_PTR_IN,
+		UVERBS_ATTR_NO_DATA(),
+	},
+	[MLX5_IB_FLOW_TYPE_ALL_DEFAULT] = {
+		.type = UVERBS_ATTR_TYPE_PTR_IN,
+		UVERBS_ATTR_NO_DATA(),
+	},
+	[MLX5_IB_FLOW_TYPE_MC_DEFAULT] = {
+		.type = UVERBS_ATTR_TYPE_PTR_IN,
+		UVERBS_ATTR_NO_DATA(),
+	},
+};
+
+static bool is_flow_dest(void *obj, int *dest_id, int *dest_type)
+{
+	struct devx_obj *devx_obj = obj;
+	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode);
+
+	switch (opcode) {
+	case MLX5_CMD_OP_DESTROY_TIR:
+		*dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+		*dest_id = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox,
+				    obj_id);
+		return true;
+
+	case MLX5_CMD_OP_DESTROY_FLOW_TABLE:
+		*dest_type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+		*dest_id = MLX5_GET(destroy_flow_table_in, devx_obj->dinbox,
+				    table_id);
+		return true;
+	default:
+		return false;
+	}
+}
+
+static int get_dests(struct uverbs_attr_bundle *attrs,
+		     struct mlx5_ib_flow_matcher *fs_matcher, int *dest_id,
+		     int *dest_type, struct ib_qp **qp, u32 *flags)
+{
+	bool dest_devx, dest_qp;
+	void *devx_obj;
+	int err;
+
+	dest_devx = uverbs_attr_is_valid(attrs,
+					 MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX);
+	dest_qp = uverbs_attr_is_valid(attrs,
+				       MLX5_IB_ATTR_CREATE_FLOW_DEST_QP);
+
+	*flags = 0;
+	err = uverbs_get_flags32(flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_FLAGS,
+				 MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS |
+					 MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP);
+	if (err)
+		return err;
+
+	/* The two flags are mutually exclusive */
+	if (*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS &&
+	    *flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)
+		return -EINVAL;
+
+	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) {
+		if (dest_devx && (dest_qp || *flags))
+			return -EINVAL;
+		else if (dest_qp && *flags)
+			return -EINVAL;
+	}
+
+	/* For FDB, allow only a DEVX object or drop as the destination */
+	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB && !(dest_devx ||
+	     (*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)))
+		return -EINVAL;
+
+	/* Allow only DEVX object or QP as dest when inserting to RDMA_RX */
+	if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) &&
+	    ((!dest_devx && !dest_qp) || (dest_devx && dest_qp)))
+		return -EINVAL;
+
+	*qp = NULL;
+	if (dest_devx) {
+		devx_obj =
+			uverbs_attr_get_obj(attrs,
+					    MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX);
+
+		/* Verify that the given DEVX object is a flow
+		 * steering destination.
+		 */
+		if (!is_flow_dest(devx_obj, dest_id, dest_type))
+			return -EINVAL;
+		/* Allow only flow table as dest when inserting to FDB or RDMA_RX */
+		if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB ||
+		     fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) &&
+		    *dest_type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
+			return -EINVAL;
+	} else if (dest_qp) {
+		struct mlx5_ib_qp *mqp;
+
+		*qp = uverbs_attr_get_obj(attrs,
+					  MLX5_IB_ATTR_CREATE_FLOW_DEST_QP);
+		if (IS_ERR(*qp))
+			return PTR_ERR(*qp);
+
+		if ((*qp)->qp_type != IB_QPT_RAW_PACKET)
+			return -EINVAL;
+
+		mqp = to_mqp(*qp);
+		if (mqp->is_rss)
+			*dest_id = mqp->rss_qp.tirn;
+		else
+			*dest_id = mqp->raw_packet_qp.rq.tirn;
+		*dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR;
+	} else if ((fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS ||
+		    fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) &&
+		   !(*flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)) {
+		*dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT;
+	}
+
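+	/* A TIR destination is only valid on the RX side */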
+	if (*dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR &&
+	    (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS ||
+	     fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX))
+		return -EINVAL;
+
+	return 0;
+}
+
+static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id)
+{
+	struct devx_obj *devx_obj = obj;
+	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode);
+
+	if (opcode == MLX5_CMD_OP_DEALLOC_FLOW_COUNTER) {
+
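+		/* The offset selects a counter within the bulk allocation */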
+		if (offset && offset >= devx_obj->flow_counter_bulk_size)
+			return false;
+
+		*counter_id = MLX5_GET(dealloc_flow_counter_in,
+				       devx_obj->dinbox,
+				       flow_counter_id);
+		*counter_id += offset;
+		return true;
+	}
+
+	return false;
+}
+
+#define MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS 2
+static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_flow_context flow_context = {.flow_tag =
+		MLX5_FS_DEFAULT_FLOW_TAG};
+	u32 *offset_attr, offset = 0, counter_id = 0;
+	int dest_id, dest_type = -1, inlen, len, ret, i;
+	struct mlx5_ib_flow_handler *flow_handler;
+	struct mlx5_ib_flow_matcher *fs_matcher;
+	struct ib_uobject **arr_flow_actions;
+	struct ib_uflow_resources *uflow_res;
+	struct mlx5_flow_act flow_act = {};
+	struct ib_qp *qp = NULL;
+	void *devx_obj, *cmd_in;
+	struct ib_uobject *uobj;
+	struct mlx5_ib_dev *dev;
+	u32 flags;
+
+	if (!capable(CAP_NET_RAW))
+		return -EPERM;
+
+	fs_matcher = uverbs_attr_get_obj(attrs,
+					 MLX5_IB_ATTR_CREATE_FLOW_MATCHER);
+	uobj = uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE);
+	dev = mlx5_udata_to_mdev(&attrs->driver_udata);
+
+	if (get_dests(attrs, fs_matcher, &dest_id, &dest_type, &qp, &flags))
+		return -EINVAL;
+
+	if (flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS)
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_NS;
+
+	if (flags & MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP)
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
+
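+	/*
+	 * An optional DEVX counter object (plus an optional offset into its
+	 * bulk) attaches a flow counter to the rule.
+	 */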
+	len = uverbs_attr_get_uobjs_arr(attrs,
+		MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions);
+	if (len) {
+		devx_obj = arr_flow_actions[0]->object;
+
+		if (uverbs_attr_is_valid(attrs,
+					 MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET)) {
+
+			int num_offsets = uverbs_attr_ptr_get_array_size(
+				attrs,
+				MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET,
+				sizeof(u32));
+
+			if (num_offsets != 1)
+				return -EINVAL;
+
+			offset_attr = uverbs_attr_get_alloced_ptr(
+				attrs,
+				MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET);
+			offset = *offset_attr;
+		}
+
+		if (!is_flow_counter(devx_obj, offset, &counter_id))
+			return -EINVAL;
+
+		flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
+	}
+
+	cmd_in = uverbs_attr_get_alloced_ptr(
+		attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE);
+	inlen = uverbs_attr_get_len(attrs,
+				    MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE);
+
+	uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS);
+	if (!uflow_res)
+		return -ENOMEM;
+
+	len = uverbs_attr_get_uobjs_arr(attrs,
+		MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions);
+	for (i = 0; i < len; i++) {
+		struct mlx5_ib_flow_action *maction =
+			to_mflow_act(arr_flow_actions[i]->object);
+
+		ret = parse_flow_flow_action(maction, false, &flow_act);
+		if (ret)
+			goto err_out;
+		flow_resources_add(uflow_res, IB_FLOW_SPEC_ACTION_HANDLE,
+				   arr_flow_actions[i]->object);
+	}
+
+	ret = uverbs_copy_from(&flow_context.flow_tag, attrs,
+			       MLX5_IB_ATTR_CREATE_FLOW_TAG);
+	if (!ret) {
+		if (flow_context.flow_tag >= BIT(24)) {
+			ret = -EINVAL;
+			goto err_out;
+		}
+		flow_context.flags |= FLOW_CONTEXT_HAS_TAG;
+	}
+
+	flow_handler =
+		raw_fs_rule_add(dev, fs_matcher, &flow_context, &flow_act,
+				counter_id, cmd_in, inlen, dest_id, dest_type);
+	if (IS_ERR(flow_handler)) {
+		ret = PTR_ERR(flow_handler);
+		goto err_out;
+	}
+
+	ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, uflow_res);
+
+	return 0;
+err_out:
+	ib_uverbs_flow_resources_free(uflow_res);
+	return ret;
+}
+
+static int flow_matcher_cleanup(struct ib_uobject *uobject,
+				enum rdma_remove_reason why,
+				struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_flow_matcher *obj = uobject->object;
+	int ret;
+
+	ret = ib_destroy_usecnt(&obj->usecnt, why, uobject);
+	if (ret)
+		return ret;
+
+	kfree(obj);
+	return 0;
+}
+
+static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs,
+			      struct mlx5_ib_flow_matcher *obj)
+{
+	enum mlx5_ib_uapi_flow_table_type ft_type =
+		MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX;
+	u32 flags;
+	int err;
+
+	/* New users should use MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE and older
+	 * users should switch to it. The legacy flags attribute is kept only
+	 * to avoid breaking existing userspace.
+	 */
+	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE) &&
+	    uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS))
+		return -EINVAL;
+
+	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE)) {
+		err = uverbs_get_const(&ft_type, attrs,
+				       MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE);
+		if (err)
+			return err;
+
+		err = mlx5_ib_ft_type_to_namespace(ft_type, &obj->ns_type);
+		if (err)
+			return err;
+
+		return 0;
+	}
+
+	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS)) {
+		err = uverbs_get_flags32(&flags, attrs,
+					 MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
+					 IB_FLOW_ATTR_FLAGS_EGRESS);
+		if (err)
+			return err;
+
+		if (flags) {
+			mlx5_ib_ft_type_to_namespace(
+				MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX,
+				&obj->ns_type);
+			return 0;
+		}
+	}
+
+	obj->ns_type = MLX5_FLOW_NAMESPACE_BYPASS;
+
+	return 0;
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE);
+	struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
+	struct mlx5_ib_flow_matcher *obj;
+	int err;
+
+	obj = kzalloc(sizeof(struct mlx5_ib_flow_matcher), GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	obj->mask_len = uverbs_attr_get_len(
+		attrs, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK);
+	err = uverbs_copy_from(&obj->matcher_mask,
+			       attrs,
+			       MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK);
+	if (err)
+		goto end;
+
+	obj->flow_type = uverbs_attr_get_enum_id(
+		attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE);
+
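+	/* For NORMAL matchers, the flow-type attribute carries the priority */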
+	if (obj->flow_type == MLX5_IB_FLOW_TYPE_NORMAL) {
+		err = uverbs_copy_from(&obj->priority,
+				       attrs,
+				       MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE);
+		if (err)
+			goto end;
+	}
+
+	err = uverbs_copy_from(&obj->match_criteria_enable,
+			       attrs,
+			       MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA);
+	if (err)
+		goto end;
+
+	err = mlx5_ib_matcher_ns(attrs, obj);
+	if (err)
+		goto end;
+
+	if (obj->ns_type == MLX5_FLOW_NAMESPACE_FDB &&
+	    mlx5_eswitch_mode(dev->mdev->priv.eswitch) !=
+			      MLX5_ESWITCH_OFFLOADS) {
+		err = -EINVAL;
+		goto end;
+	}
+
+	uobj->object = obj;
+	obj->mdev = dev->mdev;
+	atomic_set(&obj->usecnt, 0);
+	return 0;
+
+end:
+	kfree(obj);
+	return err;
+}
+
+static struct ib_flow_action *
+mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev,
+			     enum mlx5_ib_uapi_flow_table_type ft_type,
+			     u8 num_actions, void *in)
+{
+	enum mlx5_flow_namespace_type namespace;
+	struct mlx5_ib_flow_action *maction;
+	int ret;
+
+	ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace);
+	if (ret)
+		return ERR_PTR(-EINVAL);
+
+	maction = kzalloc(sizeof(*maction), GFP_KERNEL);
+	if (!maction)
+		return ERR_PTR(-ENOMEM);
+
+	maction->flow_action_raw.modify_hdr =
+		mlx5_modify_header_alloc(dev->mdev, namespace, num_actions, in);
+
+	if (IS_ERR(maction->flow_action_raw.modify_hdr)) {
+		ret = PTR_ERR(maction->flow_action_raw.modify_hdr);
+		kfree(maction);
+		return ERR_PTR(ret);
+	}
+	maction->flow_action_raw.sub_type =
+		MLX5_IB_FLOW_ACTION_MODIFY_HEADER;
+	maction->flow_action_raw.dev = dev;
+
+	return &maction->ib_action;
+}
+
+static bool mlx5_ib_modify_header_supported(struct mlx5_ib_dev *dev)
+{
+	return MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+					 max_modify_header_actions) ||
+	       MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev,
+					 max_modify_header_actions) ||
+	       MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev,
+					 max_modify_header_actions);
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE);
+	struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata);
+	enum mlx5_ib_uapi_flow_table_type ft_type;
+	struct ib_flow_action *action;
+	int num_actions;
+	void *in;
+	int ret;
+
+	if (!mlx5_ib_modify_header_supported(mdev))
+		return -EOPNOTSUPP;
+
+	in = uverbs_attr_get_alloced_ptr(attrs,
+		MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM);
+
+	num_actions = uverbs_attr_ptr_get_array_size(
+		attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM,
+		MLX5_UN_SZ_BYTES(set_add_copy_action_in_auto));
+	if (num_actions < 0)
+		return num_actions;
+
+	ret = uverbs_get_const(&ft_type, attrs,
+			       MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE);
+	if (ret)
+		return ret;
+	action = mlx5_ib_create_modify_header(mdev, ft_type, num_actions, in);
+	if (IS_ERR(action))
+		return PTR_ERR(action);
+
+	uverbs_flow_action_fill_action(action, uobj, &mdev->ib_dev,
+				       IB_FLOW_ACTION_UNSPECIFIED);
+
+	return 0;
+}
+
+static bool mlx5_ib_flow_action_packet_reformat_valid(struct mlx5_ib_dev *ibdev,
+						      u8 packet_reformat_type,
+						      u8 ft_type)
+{
+	switch (packet_reformat_type) {
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL:
+		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX)
+			return MLX5_CAP_FLOWTABLE(ibdev->mdev,
+						  encap_general_header);
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL:
+		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX)
+			return MLX5_CAP_FLOWTABLE_NIC_TX(ibdev->mdev,
+				reformat_l2_to_l3_tunnel);
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2:
+		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX)
+			return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev,
+				reformat_l3_tunnel_to_l2);
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2:
+		if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX)
+			return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, decap);
+		break;
+	default:
+		break;
+	}
+
+	return false;
+}
+
+static int mlx5_ib_dv_to_prm_packet_reforamt_type(u8 dv_prt, u8 *prm_prt)
+{
+	switch (dv_prt) {
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL:
+		*prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL;
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2:
+		*prm_prt = MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2;
+		break;
+	case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL:
+		*prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int mlx5_ib_flow_action_create_packet_reformat_ctx(
+	struct mlx5_ib_dev *dev,
+	struct mlx5_ib_flow_action *maction,
+	u8 ft_type, u8 dv_prt,
+	void *in, size_t len)
+{
+	enum mlx5_flow_namespace_type namespace;
+	u8 prm_prt;
+	int ret;
+
+	ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace);
+	if (ret)
+		return ret;
+
+	ret = mlx5_ib_dv_to_prm_packet_reforamt_type(dv_prt, &prm_prt);
+	if (ret)
+		return ret;
+
+	maction->flow_action_raw.pkt_reformat =
+		mlx5_packet_reformat_alloc(dev->mdev, prm_prt, len,
+					   in, namespace);
+	if (IS_ERR(maction->flow_action_raw.pkt_reformat)) {
+		ret = PTR_ERR(maction->flow_action_raw.pkt_reformat);
+		return ret;
+	}
+
+	maction->flow_action_raw.sub_type =
+		MLX5_IB_FLOW_ACTION_PACKET_REFORMAT;
+	maction->flow_action_raw.dev = dev;
+
+	return 0;
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
+		MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE);
+	struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata);
+	enum mlx5_ib_uapi_flow_action_packet_reformat_type dv_prt;
+	enum mlx5_ib_uapi_flow_table_type ft_type;
+	struct mlx5_ib_flow_action *maction;
+	int ret;
+
+	ret = uverbs_get_const(&ft_type, attrs,
+			       MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE);
+	if (ret)
+		return ret;
+
+	ret = uverbs_get_const(&dv_prt, attrs,
+			       MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE);
+	if (ret)
+		return ret;
+
+	if (!mlx5_ib_flow_action_packet_reformat_valid(mdev, dv_prt, ft_type))
+		return -EOPNOTSUPP;
+
+	maction = kzalloc(sizeof(*maction), GFP_KERNEL);
+	if (!maction)
+		return -ENOMEM;
+
+	if (dv_prt ==
+	    MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2) {
+		maction->flow_action_raw.sub_type =
+			MLX5_IB_FLOW_ACTION_DECAP;
+		maction->flow_action_raw.dev = mdev;
+	} else {
+		void *in;
+		int len;
+
+		in = uverbs_attr_get_alloced_ptr(attrs,
+			MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF);
+		if (IS_ERR(in)) {
+			ret = PTR_ERR(in);
+			goto free_maction;
+		}
+
+		len = uverbs_attr_get_len(attrs,
+			MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF);
+
+		ret = mlx5_ib_flow_action_create_packet_reformat_ctx(mdev,
+			maction, ft_type, dv_prt, in, len);
+		if (ret)
+			goto free_maction;
+	}
+
+	uverbs_flow_action_fill_action(&maction->ib_action, uobj, &mdev->ib_dev,
+				       IB_FLOW_ACTION_UNSPECIFIED);
+	return 0;
+
+free_maction:
+	kfree(maction);
+	return ret;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_CREATE_FLOW,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE,
+			UVERBS_OBJECT_FLOW,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(
+		MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE,
+		UVERBS_ATTR_SIZE(1, sizeof(struct mlx5_ib_match_params)),
+		UA_MANDATORY,
+		UA_ALLOC_AND_COPY),
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_MATCHER,
+			MLX5_IB_OBJECT_FLOW_MATCHER,
+			UVERBS_ACCESS_READ,
+			UA_MANDATORY),
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_QP,
+			UVERBS_OBJECT_QP,
+			UVERBS_ACCESS_READ),
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX,
+			MLX5_IB_OBJECT_DEVX_OBJ,
+			UVERBS_ACCESS_READ),
+	UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS,
+			     UVERBS_OBJECT_FLOW_ACTION,
+			     UVERBS_ACCESS_READ, 1,
+			     MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS,
+			     UA_OPTIONAL),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_TAG,
+			   UVERBS_ATTR_TYPE(u32),
+			   UA_OPTIONAL),
+	UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX,
+			     MLX5_IB_OBJECT_DEVX_OBJ,
+			     UVERBS_ACCESS_READ, 1, 1,
+			     UA_OPTIONAL),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET,
+			   UVERBS_ATTR_MIN_SIZE(sizeof(u32)),
+			   UA_OPTIONAL,
+			   UA_ALLOC_AND_COPY),
+	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_FLAGS,
+			     enum mlx5_ib_create_flow_flags,
+			     UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	MLX5_IB_METHOD_DESTROY_FLOW,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE,
+			UVERBS_OBJECT_FLOW,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
+ADD_UVERBS_METHODS(mlx5_ib_fs,
+		   UVERBS_OBJECT_FLOW,
+		   &UVERBS_METHOD(MLX5_IB_METHOD_CREATE_FLOW),
+		   &UVERBS_METHOD(MLX5_IB_METHOD_DESTROY_FLOW));
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE,
+			UVERBS_OBJECT_FLOW_ACTION,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM,
+			   UVERBS_ATTR_MIN_SIZE(MLX5_UN_SZ_BYTES(
+				   set_add_copy_action_in_auto)),
+			   UA_MANDATORY,
+			   UA_ALLOC_AND_COPY),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE,
+			     enum mlx5_ib_uapi_flow_table_type,
+			     UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE,
+			UVERBS_OBJECT_FLOW_ACTION,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF,
+			   UVERBS_ATTR_MIN_SIZE(1),
+			   UA_ALLOC_AND_COPY,
+			   UA_OPTIONAL),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE,
+			     enum mlx5_ib_uapi_flow_action_packet_reformat_type,
+			     UA_MANDATORY),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE,
+			     enum mlx5_ib_uapi_flow_table_type,
+			     UA_MANDATORY));
+
+ADD_UVERBS_METHODS(
+	mlx5_ib_flow_actions,
+	UVERBS_OBJECT_FLOW_ACTION,
+	&UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER),
+	&UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT));
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_FLOW_MATCHER_CREATE,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE,
+			MLX5_IB_OBJECT_FLOW_MATCHER,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(
+		MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK,
+		UVERBS_ATTR_SIZE(1, sizeof(struct mlx5_ib_match_params)),
+		UA_MANDATORY),
+	UVERBS_ATTR_ENUM_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE,
+			    mlx5_ib_flow_type,
+			    UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA,
+			   UVERBS_ATTR_TYPE(u8),
+			   UA_MANDATORY),
+	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
+			     enum ib_flow_flags,
+			     UA_OPTIONAL),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE,
+			     enum mlx5_ib_uapi_flow_table_type,
+			     UA_OPTIONAL));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	MLX5_IB_METHOD_FLOW_MATCHER_DESTROY,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_DESTROY_HANDLE,
+			MLX5_IB_OBJECT_FLOW_MATCHER,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER,
+			    UVERBS_TYPE_ALLOC_IDR(flow_matcher_cleanup),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY));
+
+const struct uapi_definition mlx5_ib_flow_defs[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_FLOW_MATCHER),
+	UAPI_DEF_CHAIN_OBJ_TREE(
+		UVERBS_OBJECT_FLOW,
+		&mlx5_ib_fs),
+	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
+				&mlx5_ib_flow_actions),
+	{},
+};
+
+static const struct ib_device_ops flow_ops = {
+	.create_flow = mlx5_ib_create_flow,
+	.destroy_flow = mlx5_ib_destroy_flow,
+	.destroy_flow_action = mlx5_ib_destroy_flow_action,
+};
+
+static const struct ib_device_ops flow_ipsec_ops = {
+	.create_flow_action_esp = mlx5_ib_create_flow_action_esp,
+	.modify_flow_action_esp = mlx5_ib_modify_flow_action_esp,
+};
+
+int mlx5_ib_fs_init(struct mlx5_ib_dev *dev)
+{
+	dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL);
+
+	if (!dev->flow_db)
+		return -ENOMEM;
+
+	mutex_init(&dev->flow_db->lock);
+
+	ib_set_device_ops(&dev->ib_dev, &flow_ops);
+	if (mlx5_accel_ipsec_device_caps(dev->mdev) &
+	    MLX5_ACCEL_IPSEC_CAP_DEVICE)
+		ib_set_device_ops(&dev->ib_dev, &flow_ipsec_ops);
+
+	return 0;
+}
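
Note on the packet-reformat helpers added in fs.c above: mlx5_ib_flow_action_packet_reformat_valid() gates each uAPI reformat type on the matching NIC RX/TX flow-table capability, and mlx5_ib_dv_to_prm_packet_reforamt_type() then translates the uAPI constant into the firmware (PRM) constant handed to mlx5_packet_reformat_alloc(). The standalone sketch below models that two-step flow in plain userspace C; the enum values and the capability struct are illustrative stand-ins, not the kernel definitions.

	/* Illustrative stand-ins for the uAPI and PRM constants (not the kernel values). */
	#include <stdbool.h>
	#include <stdio.h>

	enum dv_reformat { DV_L2_TO_L2_TUNNEL, DV_L2_TO_L3_TUNNEL,
			   DV_L3_TUNNEL_TO_L2, DV_L2_TUNNEL_TO_L2 };
	enum ft_type { FT_NIC_RX, FT_NIC_TX };
	enum prm_reformat { PRM_L2_TO_L2_TUNNEL, PRM_L2_TO_L3_TUNNEL, PRM_L3_TUNNEL_TO_L2 };

	struct caps {				/* models the MLX5_CAP_FLOWTABLE_* queries */
		bool encap_general_header;	/* TX: L2-to-L2 encap */
		bool reformat_l2_to_l3_tunnel;	/* TX: L2-to-L3 encap */
		bool reformat_l3_tunnel_to_l2;	/* RX: L3 decap */
		bool decap;			/* RX: plain L2 decap */
	};

	/* Step 1: is this (reformat type, table type) pair supported by the device? */
	static bool reformat_valid(const struct caps *c, enum dv_reformat t, enum ft_type ft)
	{
		switch (t) {
		case DV_L2_TO_L2_TUNNEL: return ft == FT_NIC_TX && c->encap_general_header;
		case DV_L2_TO_L3_TUNNEL: return ft == FT_NIC_TX && c->reformat_l2_to_l3_tunnel;
		case DV_L3_TUNNEL_TO_L2: return ft == FT_NIC_RX && c->reformat_l3_tunnel_to_l2;
		case DV_L2_TUNNEL_TO_L2: return ft == FT_NIC_RX && c->decap;
		}
		return false;
	}

	/* Step 2: translate the uAPI constant to the firmware constant (decap needs no PRM ctx). */
	static int dv_to_prm(enum dv_reformat t, enum prm_reformat *out)
	{
		switch (t) {
		case DV_L2_TO_L2_TUNNEL: *out = PRM_L2_TO_L2_TUNNEL; return 0;
		case DV_L3_TUNNEL_TO_L2: *out = PRM_L3_TUNNEL_TO_L2; return 0;
		case DV_L2_TO_L3_TUNNEL: *out = PRM_L2_TO_L3_TUNNEL; return 0;
		default: return -1;
		}
	}

	int main(void)
	{
		struct caps c = { .encap_general_header = true, .decap = true };
		enum prm_reformat prm;

		if (reformat_valid(&c, DV_L2_TO_L2_TUNNEL, FT_NIC_TX) &&
		    !dv_to_prm(DV_L2_TO_L2_TUNNEL, &prm))
			printf("L2-to-L2 encap on TX -> PRM type %d\n", prm);
		if (!reformat_valid(&c, DV_L2_TO_L3_TUNNEL, FT_NIC_TX))
			printf("L2-to-L3 encap rejected: capability not set\n");
		return 0;
	}
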
diff --git a/drivers/infiniband/hw/mlx5/fs.h b/drivers/infiniband/hw/mlx5/fs.h
new file mode 100644
index 0000000..ad320ad
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/fs.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#ifndef _MLX5_IB_FS_H
+#define _MLX5_IB_FS_H
+
+#include "mlx5_ib.h"
+
+#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
+int mlx5_ib_fs_init(struct mlx5_ib_dev *dev);
+#else
+static inline int mlx5_ib_fs_init(struct mlx5_ib_dev *dev)
+{
+	dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL);
+
+	if (!dev->flow_db)
+		return -ENOMEM;
+
+	mutex_init(&dev->flow_db->lock);
+	return 0;
+}
+#endif
+static inline void mlx5_ib_fs_cleanup(struct mlx5_ib_dev *dev)
+{
+	kfree(dev->flow_db);
+}
+#endif /* _MLX5_IB_FS_H */
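
The new fs.h header follows the usual Kconfig-stub idiom: the real mlx5_ib_fs_init() is built only with CONFIG_INFINIBAND_USER_ACCESS, while a static inline fallback still allocates flow_db so the rest of the driver needs no #ifdefs. The minimal sketch below shows that idiom with made-up names (feature_init, FEATURE_ENABLED); it is an illustration of the pattern, not driver code.

	/* Minimal sketch of the Kconfig-stub idiom used in fs.h: callers always call
	 * feature_init()/feature_cleanup(); when the feature is compiled out, a static
	 * inline fallback does only the bookkeeping the rest of the code relies on.
	 * FEATURE_ENABLED stands in for IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS). */
	#include <stdio.h>
	#include <stdlib.h>

	struct dev { void *flow_db; };

	#ifdef FEATURE_ENABLED
	int feature_init(struct dev *d)
	{
		d->flow_db = malloc(64);
		if (!d->flow_db)
			return -1;
		printf("full init: uverbs methods registered\n");
		return 0;
	}
	#else
	static inline int feature_init(struct dev *d)
	{
		d->flow_db = malloc(64);	/* same bookkeeping, no uverbs wiring */
		return d->flow_db ? 0 : -1;
	}
	#endif

	static inline void feature_cleanup(struct dev *d)
	{
		free(d->flow_db);		/* cleanup is shared by both variants */
	}

	int main(void)
	{
		struct dev d = { 0 };

		if (feature_init(&d))
			return 1;
		feature_cleanup(&d);
		return 0;
	}
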
diff --git a/drivers/infiniband/hw/mlx5/gsi.c b/drivers/infiniband/hw/mlx5/gsi.c
index 5c73c0a..7fcad91 100644
--- a/drivers/infiniband/hw/mlx5/gsi.c
+++ b/drivers/infiniband/hw/mlx5/gsi.c
@@ -35,44 +35,19 @@
 struct mlx5_ib_gsi_wr {
 	struct ib_cqe cqe;
 	struct ib_wc wc;
-	int send_flags;
 	bool completed:1;
 };
 
-struct mlx5_ib_gsi_qp {
-	struct ib_qp ibqp;
-	struct ib_qp *rx_qp;
-	u8 port_num;
-	struct ib_qp_cap cap;
-	enum ib_sig_type sq_sig_type;
-	/* Serialize qp state modifications */
-	struct mutex mutex;
-	struct ib_cq *cq;
-	struct mlx5_ib_gsi_wr *outstanding_wrs;
-	u32 outstanding_pi, outstanding_ci;
-	int num_qps;
-	/* Protects access to the tx_qps. Post send operations synchronize
-	 * with tx_qp creation in setup_qp(). Also protects the
-	 * outstanding_wrs array and indices.
-	 */
-	spinlock_t lock;
-	struct ib_qp **tx_qps;
-};
-
-static struct mlx5_ib_gsi_qp *gsi_qp(struct ib_qp *qp)
-{
-	return container_of(qp, struct mlx5_ib_gsi_qp, ibqp);
-}
-
 static bool mlx5_ib_deth_sqpn_cap(struct mlx5_ib_dev *dev)
 {
 	return MLX5_CAP_GEN(dev->mdev, set_deth_sqpn);
 }
 
 /* Call with gsi->lock locked */
-static void generate_completions(struct mlx5_ib_gsi_qp *gsi)
+static void generate_completions(struct mlx5_ib_qp *mqp)
 {
-	struct ib_cq *gsi_cq = gsi->ibqp.send_cq;
+	struct mlx5_ib_gsi_qp *gsi = &mqp->gsi;
+	struct ib_cq *gsi_cq = mqp->ibqp.send_cq;
 	struct mlx5_ib_gsi_wr *wr;
 	u32 index;
 
@@ -83,10 +58,7 @@
 		if (!wr->completed)
 			break;
 
-		if (gsi->sq_sig_type == IB_SIGNAL_ALL_WR ||
-		    wr->send_flags & IB_SEND_SIGNALED)
-			WARN_ON_ONCE(mlx5_ib_generate_wc(gsi_cq, &wr->wc));
-
+		WARN_ON_ONCE(mlx5_ib_generate_wc(gsi_cq, &wr->wc));
 		wr->completed = false;
 	}
 
@@ -98,6 +70,7 @@
 	struct mlx5_ib_gsi_qp *gsi = cq->cq_context;
 	struct mlx5_ib_gsi_wr *wr =
 		container_of(wc->wr_cqe, struct mlx5_ib_gsi_wr, cqe);
+	struct mlx5_ib_qp *mqp = container_of(gsi, struct mlx5_ib_qp, gsi);
 	u64 wr_id;
 	unsigned long flags;
 
@@ -106,52 +79,43 @@
 	wr_id = wr->wc.wr_id;
 	wr->wc = *wc;
 	wr->wc.wr_id = wr_id;
-	wr->wc.qp = &gsi->ibqp;
+	wr->wc.qp = &mqp->ibqp;
 
-	generate_completions(gsi);
+	generate_completions(mqp);
 	spin_unlock_irqrestore(&gsi->lock, flags);
 }
 
-struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd,
-				    struct ib_qp_init_attr *init_attr)
+int mlx5_ib_create_gsi(struct ib_pd *pd, struct mlx5_ib_qp *mqp,
+		       struct ib_qp_init_attr *attr)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_ib_gsi_qp *gsi;
-	struct ib_qp_init_attr hw_init_attr = *init_attr;
-	const u8 port_num = init_attr->port_num;
-	const int num_pkeys = pd->device->attrs.max_pkeys;
-	const int num_qps = mlx5_ib_deth_sqpn_cap(dev) ? num_pkeys : 0;
+	struct ib_qp_init_attr hw_init_attr = *attr;
+	const u8 port_num = attr->port_num;
+	int num_qps = 0;
 	int ret;
 
-	mlx5_ib_dbg(dev, "creating GSI QP\n");
-
-	if (port_num > ARRAY_SIZE(dev->devr.ports) || port_num < 1) {
-		mlx5_ib_warn(dev,
-			     "invalid port number %d during GSI QP creation\n",
-			     port_num);
-		return ERR_PTR(-EINVAL);
+	if (mlx5_ib_deth_sqpn_cap(dev)) {
+		if (MLX5_CAP_GEN(dev->mdev,
+				 port_type) == MLX5_CAP_PORT_TYPE_IB)
+			num_qps = pd->device->attrs.max_pkeys;
+		else if (dev->lag_active)
+			num_qps = MLX5_MAX_PORTS;
 	}
 
-	gsi = kzalloc(sizeof(*gsi), GFP_KERNEL);
-	if (!gsi)
-		return ERR_PTR(-ENOMEM);
-
+	gsi = &mqp->gsi;
 	gsi->tx_qps = kcalloc(num_qps, sizeof(*gsi->tx_qps), GFP_KERNEL);
-	if (!gsi->tx_qps) {
-		ret = -ENOMEM;
-		goto err_free;
-	}
+	if (!gsi->tx_qps)
+		return -ENOMEM;
 
-	gsi->outstanding_wrs = kcalloc(init_attr->cap.max_send_wr,
-				       sizeof(*gsi->outstanding_wrs),
-				       GFP_KERNEL);
+	gsi->outstanding_wrs =
+		kcalloc(attr->cap.max_send_wr, sizeof(*gsi->outstanding_wrs),
+			GFP_KERNEL);
 	if (!gsi->outstanding_wrs) {
 		ret = -ENOMEM;
 		goto err_free_tx;
 	}
 
-	mutex_init(&gsi->mutex);
-
 	mutex_lock(&dev->devr.mutex);
 
 	if (dev->devr.ports[port_num - 1].gsi) {
@@ -163,12 +127,10 @@
 	gsi->num_qps = num_qps;
 	spin_lock_init(&gsi->lock);
 
-	gsi->cap = init_attr->cap;
-	gsi->sq_sig_type = init_attr->sq_sig_type;
-	gsi->ibqp.qp_num = 1;
+	gsi->cap = attr->cap;
 	gsi->port_num = port_num;
 
-	gsi->cq = ib_alloc_cq(pd->device, gsi, init_attr->cap.max_send_wr, 0,
+	gsi->cq = ib_alloc_cq(pd->device, gsi, attr->cap.max_send_wr, 0,
 			      IB_POLL_SOFTIRQ);
 	if (IS_ERR(gsi->cq)) {
 		mlx5_ib_warn(dev, "unable to create send CQ for GSI QP. error %ld\n",
@@ -184,19 +146,31 @@
 		hw_init_attr.cap.max_send_sge = 0;
 		hw_init_attr.cap.max_inline_data = 0;
 	}
-	gsi->rx_qp = ib_create_qp(pd, &hw_init_attr);
+
+	gsi->rx_qp = mlx5_ib_create_qp(pd, &hw_init_attr, NULL);
 	if (IS_ERR(gsi->rx_qp)) {
 		mlx5_ib_warn(dev, "unable to create hardware GSI QP. error %ld\n",
 			     PTR_ERR(gsi->rx_qp));
 		ret = PTR_ERR(gsi->rx_qp);
 		goto err_destroy_cq;
 	}
+	gsi->rx_qp->device = pd->device;
+	gsi->rx_qp->pd = pd;
+	gsi->rx_qp->real_qp = gsi->rx_qp;
 
-	dev->devr.ports[init_attr->port_num - 1].gsi = gsi;
+	gsi->rx_qp->qp_type = hw_init_attr.qp_type;
+	gsi->rx_qp->send_cq = hw_init_attr.send_cq;
+	gsi->rx_qp->recv_cq = hw_init_attr.recv_cq;
+	gsi->rx_qp->event_handler = hw_init_attr.event_handler;
+	spin_lock_init(&gsi->rx_qp->mr_lock);
+	INIT_LIST_HEAD(&gsi->rx_qp->rdma_mrs);
+	INIT_LIST_HEAD(&gsi->rx_qp->sig_mrs);
+
+	dev->devr.ports[attr->port_num - 1].gsi = gsi;
 
 	mutex_unlock(&dev->devr.mutex);
 
-	return &gsi->ibqp;
+	return 0;
 
 err_destroy_cq:
 	ib_free_cq(gsi->cq);
@@ -205,23 +179,19 @@
 	kfree(gsi->outstanding_wrs);
 err_free_tx:
 	kfree(gsi->tx_qps);
-err_free:
-	kfree(gsi);
-	return ERR_PTR(ret);
+	return ret;
 }
 
-int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp)
+int mlx5_ib_destroy_gsi(struct mlx5_ib_qp *mqp)
 {
-	struct mlx5_ib_dev *dev = to_mdev(qp->device);
-	struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp);
+	struct mlx5_ib_dev *dev = to_mdev(mqp->ibqp.device);
+	struct mlx5_ib_gsi_qp *gsi = &mqp->gsi;
 	const int port_num = gsi->port_num;
 	int qp_index;
 	int ret;
 
-	mlx5_ib_dbg(dev, "destroying GSI QP\n");
-
 	mutex_lock(&dev->devr.mutex);
-	ret = ib_destroy_qp(gsi->rx_qp);
+	ret = mlx5_ib_destroy_qp(gsi->rx_qp, NULL);
 	if (ret) {
 		mlx5_ib_warn(dev, "unable to destroy hardware GSI QP. error %d\n",
 			     ret);
@@ -243,7 +213,7 @@
 
 	kfree(gsi->outstanding_wrs);
 	kfree(gsi->tx_qps);
-	kfree(gsi);
+	kfree(mqp);
 
 	return 0;
 }
@@ -261,16 +231,15 @@
 			.max_send_sge = gsi->cap.max_send_sge,
 			.max_inline_data = gsi->cap.max_inline_data,
 		},
-		.sq_sig_type = gsi->sq_sig_type,
 		.qp_type = IB_QPT_UD,
-		.create_flags = mlx5_ib_create_qp_sqpn_qp1(),
+		.create_flags = MLX5_IB_QP_CREATE_SQPN_QP1,
 	};
 
 	return ib_create_qp(pd, &init_attr);
 }
 
 static int modify_to_rts(struct mlx5_ib_gsi_qp *gsi, struct ib_qp *qp,
-			 u16 qp_index)
+			 u16 pkey_index)
 {
 	struct mlx5_ib_dev *dev = to_mdev(qp->device);
 	struct ib_qp_attr attr;
@@ -279,7 +248,7 @@
 
 	mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY | IB_QP_PORT;
 	attr.qp_state = IB_QPS_INIT;
-	attr.pkey_index = qp_index;
+	attr.pkey_index = pkey_index;
 	attr.qkey = IB_QP1_QKEY;
 	attr.port_num = gsi->port_num;
 	ret = ib_modify_qp(qp, &attr, mask);
@@ -313,12 +282,17 @@
 {
 	struct ib_device *device = gsi->rx_qp->device;
 	struct mlx5_ib_dev *dev = to_mdev(device);
+	int pkey_index = qp_index;
+	struct mlx5_ib_qp *mqp;
 	struct ib_qp *qp;
 	unsigned long flags;
 	u16 pkey;
 	int ret;
 
-	ret = ib_query_pkey(device, gsi->port_num, qp_index, &pkey);
+	if (MLX5_CAP_GEN(dev->mdev,  port_type) != MLX5_CAP_PORT_TYPE_IB)
+		pkey_index = 0;
+
+	ret = ib_query_pkey(device, gsi->port_num, pkey_index, &pkey);
 	if (ret) {
 		mlx5_ib_warn(dev, "unable to read P_Key at port %d, index %d\n",
 			     gsi->port_num, qp_index);
@@ -347,7 +321,10 @@
 		return;
 	}
 
-	ret = modify_to_rts(gsi, qp, qp_index);
+	mqp = to_mqp(qp);
+	if (dev->lag_active)
+		mqp->gsi_lag_port = qp_index + 1;
+	ret = modify_to_rts(gsi, qp, pkey_index);
 	if (ret)
 		goto err_destroy_qp;
 
@@ -364,56 +341,54 @@
 
 static void setup_qps(struct mlx5_ib_gsi_qp *gsi)
 {
+	struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device);
 	u16 qp_index;
 
+	mutex_lock(&dev->devr.mutex);
 	for (qp_index = 0; qp_index < gsi->num_qps; ++qp_index)
 		setup_qp(gsi, qp_index);
+	mutex_unlock(&dev->devr.mutex);
 }
 
 int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr,
 			  int attr_mask)
 {
 	struct mlx5_ib_dev *dev = to_mdev(qp->device);
-	struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp);
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+	struct mlx5_ib_gsi_qp *gsi = &mqp->gsi;
 	int ret;
 
 	mlx5_ib_dbg(dev, "modifying GSI QP to state %d\n", attr->qp_state);
 
-	mutex_lock(&gsi->mutex);
 	ret = ib_modify_qp(gsi->rx_qp, attr, attr_mask);
 	if (ret) {
 		mlx5_ib_warn(dev, "unable to modify GSI rx QP: %d\n", ret);
-		goto unlock;
+		return ret;
 	}
 
 	if (to_mqp(gsi->rx_qp)->state == IB_QPS_RTS)
 		setup_qps(gsi);
-
-unlock:
-	mutex_unlock(&gsi->mutex);
-
-	return ret;
+	return 0;
 }
 
 int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
 			 int qp_attr_mask,
 			 struct ib_qp_init_attr *qp_init_attr)
 {
-	struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp);
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+	struct mlx5_ib_gsi_qp *gsi = &mqp->gsi;
 	int ret;
 
-	mutex_lock(&gsi->mutex);
 	ret = ib_query_qp(gsi->rx_qp, qp_attr, qp_attr_mask, qp_init_attr);
 	qp_init_attr->cap = gsi->cap;
-	mutex_unlock(&gsi->mutex);
-
 	return ret;
 }
 
 /* Call with gsi->lock locked */
-static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_gsi_qp *gsi,
+static int mlx5_ib_add_outstanding_wr(struct mlx5_ib_qp *mqp,
 				      struct ib_ud_wr *wr, struct ib_wc *wc)
 {
+	struct mlx5_ib_gsi_qp *gsi = &mqp->gsi;
 	struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device);
 	struct mlx5_ib_gsi_wr *gsi_wr;
 
@@ -442,22 +417,21 @@
 }
 
 /* Call with gsi->lock locked */
-static int mlx5_ib_gsi_silent_drop(struct mlx5_ib_gsi_qp *gsi,
-				    struct ib_ud_wr *wr)
+static int mlx5_ib_gsi_silent_drop(struct mlx5_ib_qp *mqp, struct ib_ud_wr *wr)
 {
 	struct ib_wc wc = {
 		{ .wr_id = wr->wr.wr_id },
 		.status = IB_WC_SUCCESS,
 		.opcode = IB_WC_SEND,
-		.qp = &gsi->ibqp,
+		.qp = &mqp->ibqp,
 	};
 	int ret;
 
-	ret = mlx5_ib_add_outstanding_wr(gsi, wr, &wc);
+	ret = mlx5_ib_add_outstanding_wr(mqp, wr, &wc);
 	if (ret)
 		return ret;
 
-	generate_completions(gsi);
+	generate_completions(mqp);
 
 	return 0;
 }
@@ -466,11 +440,15 @@
 static struct ib_qp *get_tx_qp(struct mlx5_ib_gsi_qp *gsi, struct ib_ud_wr *wr)
 {
 	struct mlx5_ib_dev *dev = to_mdev(gsi->rx_qp->device);
+	struct mlx5_ib_ah *ah = to_mah(wr->ah);
 	int qp_index = wr->pkey_index;
 
-	if (!mlx5_ib_deth_sqpn_cap(dev))
+	if (!gsi->num_qps)
 		return gsi->rx_qp;
 
+	if (dev->lag_active && ah->xmit_port)
+		qp_index = ah->xmit_port - 1;
+
 	if (qp_index >= gsi->num_qps)
 		return NULL;
 
@@ -480,7 +458,8 @@
 int mlx5_ib_gsi_post_send(struct ib_qp *qp, const struct ib_send_wr *wr,
 			  const struct ib_send_wr **bad_wr)
 {
-	struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp);
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+	struct mlx5_ib_gsi_qp *gsi = &mqp->gsi;
 	struct ib_qp *tx_qp;
 	unsigned long flags;
 	int ret;
@@ -493,14 +472,14 @@
 		spin_lock_irqsave(&gsi->lock, flags);
 		tx_qp = get_tx_qp(gsi, &cur_wr);
 		if (!tx_qp) {
-			ret = mlx5_ib_gsi_silent_drop(gsi, &cur_wr);
+			ret = mlx5_ib_gsi_silent_drop(mqp, &cur_wr);
 			if (ret)
 				goto err;
 			spin_unlock_irqrestore(&gsi->lock, flags);
 			continue;
 		}
 
-		ret = mlx5_ib_add_outstanding_wr(gsi, &cur_wr, NULL);
+		ret = mlx5_ib_add_outstanding_wr(mqp, &cur_wr, NULL);
 		if (ret)
 			goto err;
 
@@ -524,7 +503,8 @@
 int mlx5_ib_gsi_post_recv(struct ib_qp *qp, const struct ib_recv_wr *wr,
 			  const struct ib_recv_wr **bad_wr)
 {
-	struct mlx5_ib_gsi_qp *gsi = gsi_qp(qp);
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+	struct mlx5_ib_gsi_qp *gsi = &mqp->gsi;
 
 	return ib_post_recv(gsi->rx_qp, wr, bad_wr);
 }
@@ -534,7 +514,5 @@
 	if (!gsi)
 		return;
 
-	mutex_lock(&gsi->mutex);
 	setup_qps(gsi);
-	mutex_unlock(&gsi->mutex);
 }
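
Note on the reworked GSI TX-QP selection in gsi.c above: get_tx_qp() now indexes the per-P_Key UD QP array, switches to the AH's transmit port (minus one) when LAG is active, and falls back to the hardware RX QP when no per-index QPs were created (gsi->num_qps == 0); an out-of-range index makes the caller silently drop the WR. The sketch below is a simplified userspace model of that selection; the structs are stand-ins for the mlx5_ib types, not kernel code.

	#include <stdbool.h>
	#include <stdio.h>

	struct qp { int id; };

	struct gsi {
		struct qp *rx_qp;	/* hardware GSI QP, used when no per-index QPs exist */
		struct qp **tx_qps;	/* one UD QP per P_Key (IB) or per LAG port (RoCE LAG) */
		int num_qps;
	};

	/* lag_active / xmit_port mirror dev->lag_active and to_mah(wr->ah)->xmit_port */
	static struct qp *pick_tx_qp(struct gsi *gsi, int pkey_index,
				     bool lag_active, int xmit_port)
	{
		int qp_index = pkey_index;

		if (!gsi->num_qps)
			return gsi->rx_qp;		/* no per-index QPs: single QP */

		if (lag_active && xmit_port)
			qp_index = xmit_port - 1;	/* LAG: index by transmit port */

		if (qp_index >= gsi->num_qps)
			return NULL;			/* caller silently drops the WR */

		return gsi->tx_qps[qp_index];
	}

	int main(void)
	{
		struct qp rx = { .id = 1 }, tx0 = { .id = 10 }, tx1 = { .id = 11 };
		struct qp *tx_qps[] = { &tx0, &tx1 };
		struct gsi gsi = { .rx_qp = &rx, .tx_qps = tx_qps, .num_qps = 2 };

		printf("pkey 1       -> qp %d\n", pick_tx_qp(&gsi, 1, false, 0)->id);
		printf("lag, port 2  -> qp %d\n", pick_tx_qp(&gsi, 0, true, 2)->id);
		printf("out of range -> %s\n", pick_tx_qp(&gsi, 5, false, 0) ? "qp" : "dropped");
		return 0;
	}
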
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
index 74ce924..5c3d052 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.c
+++ b/drivers/infiniband/hw/mlx5/ib_rep.c
@@ -35,7 +35,7 @@
 	int vport_index;
 
 	if (rep->vport == MLX5_VPORT_UPLINK)
-		profile = &uplink_rep_profile;
+		profile = &raw_eth_profile;
 	else
 		return mlx5_ib_set_vport_rep(dev, rep);
 
diff --git a/drivers/infiniband/hw/mlx5/ib_rep.h b/drivers/infiniband/hw/mlx5/ib_rep.h
index de43b42..5b30d3f 100644
--- a/drivers/infiniband/hw/mlx5/ib_rep.h
+++ b/drivers/infiniband/hw/mlx5/ib_rep.h
@@ -9,9 +9,9 @@
 #include <linux/mlx5/eswitch.h>
 #include "mlx5_ib.h"
 
-#ifdef CONFIG_MLX5_ESWITCH
-extern const struct mlx5_ib_profile uplink_rep_profile;
+extern const struct mlx5_ib_profile raw_eth_profile;
 
+#ifdef CONFIG_MLX5_ESWITCH
 u8 mlx5_ib_eswitch_mode(struct mlx5_eswitch *esw);
 struct mlx5_ib_dev *mlx5_ib_get_rep_ibdev(struct mlx5_eswitch *esw,
 					  u16 vport_num);
diff --git a/drivers/infiniband/hw/mlx5/ib_virt.c b/drivers/infiniband/hw/mlx5/ib_virt.c
index 649a336..46b2d37 100644
--- a/drivers/infiniband/hw/mlx5/ib_virt.c
+++ b/drivers/infiniband/hw/mlx5/ib_virt.c
@@ -134,7 +134,7 @@
 	if (!out)
 		return -ENOMEM;
 
-	err = mlx5_core_query_vport_counter(mdev, true, vf, port, out, out_sz);
+	err = mlx5_core_query_vport_counter(mdev, true, vf, port, out);
 	if (err)
 		goto ex;
 
@@ -164,8 +164,10 @@
 	in->field_select = MLX5_HCA_VPORT_SEL_NODE_GUID;
 	in->node_guid = guid;
 	err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in);
-	if (!err)
+	if (!err) {
 		vfs_ctx[vf].node_guid = guid;
+		vfs_ctx[vf].node_guid_valid = 1;
+	}
 	kfree(in);
 	return err;
 }
@@ -185,8 +187,10 @@
 	in->field_select = MLX5_HCA_VPORT_SEL_PORT_GUID;
 	in->port_guid = guid;
 	err = mlx5_core_modify_hca_vport_context(mdev, 1, 1, vf + 1, in);
-	if (!err)
+	if (!err) {
 		vfs_ctx[vf].port_guid = guid;
+		vfs_ctx[vf].port_guid_valid = 1;
+	}
 	kfree(in);
 	return err;
 }
@@ -201,3 +205,19 @@
 
 	return -EINVAL;
 }
+
+int mlx5_ib_get_vf_guid(struct ib_device *device, int vf, u8 port,
+			struct ifla_vf_guid *node_guid,
+			struct ifla_vf_guid *port_guid)
+{
+	struct mlx5_ib_dev *dev = to_mdev(device);
+	struct mlx5_core_dev *mdev = dev->mdev;
+	struct mlx5_vf_context *vfs_ctx = mdev->priv.sriov.vfs_ctx;
+
+	node_guid->guid =
+		vfs_ctx[vf].node_guid_valid ? vfs_ctx[vf].node_guid : 0;
+	port_guid->guid =
+		vfs_ctx[vf].port_guid_valid ? vfs_ctx[vf].port_guid : 0;
+
+	return 0;
+}
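
The ib_virt.c change above pairs each cached VF GUID with a *_guid_valid flag that is set only after the device accepted the new value, and the new mlx5_ib_get_vf_guid() reports 0 until then. A minimal sketch of that cache-with-valid-flag pattern follows; the struct is a stand-in for mlx5_vf_context, not the kernel definition.

	#include <stdint.h>
	#include <stdio.h>

	struct vf_ctx {
		uint64_t node_guid;
		unsigned node_guid_valid:1;
	};

	/* Mirrors set_vf_node_guid(): mark the cache valid only on device success. */
	static int set_node_guid(struct vf_ctx *vf, uint64_t guid, int device_err)
	{
		if (!device_err) {
			vf->node_guid = guid;
			vf->node_guid_valid = 1;
		}
		return device_err;
	}

	/* Mirrors mlx5_ib_get_vf_guid(): report 0 until a GUID was successfully set. */
	static uint64_t get_node_guid(const struct vf_ctx *vf)
	{
		return vf->node_guid_valid ? vf->node_guid : 0;
	}

	int main(void)
	{
		struct vf_ctx vf = { 0 };

		printf("before set: %#llx\n", (unsigned long long)get_node_guid(&vf));
		set_node_guid(&vf, 0x1122334455667788ULL, 0);
		printf("after set:  %#llx\n", (unsigned long long)get_node_guid(&vf));
		return 0;
	}
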
diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c
index 348c1df..9bb9bb0 100644
--- a/drivers/infiniband/hw/mlx5/mad.c
+++ b/drivers/infiniband/hw/mlx5/mad.c
@@ -30,7 +30,6 @@
  * SOFTWARE.
  */
 
-#include <linux/mlx5/cmd.h>
 #include <linux/mlx5/vport.h>
 #include <rdma/ib_mad.h>
 #include <rdma/ib_smi.h>
@@ -74,58 +73,6 @@
 				port);
 }
 
-static int process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
-		       const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-		       const struct ib_mad *in_mad, struct ib_mad *out_mad)
-{
-	u16 slid;
-	int err;
-
-	slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE);
-
-	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0)
-		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-
-	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
-	    in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-		if (in_mad->mad_hdr.method   != IB_MGMT_METHOD_GET &&
-		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_SET &&
-		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_TRAP_REPRESS)
-			return IB_MAD_RESULT_SUCCESS;
-
-		/* Don't process SMInfo queries -- the SMA can't handle them.
-		 */
-		if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO)
-			return IB_MAD_RESULT_SUCCESS;
-	} else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
-		   in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS1   ||
-		   in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS2   ||
-		   in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) {
-		if (in_mad->mad_hdr.method  != IB_MGMT_METHOD_GET &&
-		    in_mad->mad_hdr.method  != IB_MGMT_METHOD_SET)
-			return IB_MAD_RESULT_SUCCESS;
-	} else {
-		return IB_MAD_RESULT_SUCCESS;
-	}
-
-	err = mlx5_MAD_IFC(to_mdev(ibdev),
-			   mad_flags & IB_MAD_IGNORE_MKEY,
-			   mad_flags & IB_MAD_IGNORE_BKEY,
-			   port_num, in_wc, in_grh, in_mad, out_mad);
-	if (err)
-		return IB_MAD_RESULT_FAILURE;
-
-	/* set return bit in status of directed route responses */
-	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
-		out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
-
-	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
-		/* no response for trap repress */
-		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
-
-	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
-}
-
 static void pma_cnt_ext_assign(struct ib_pma_portcounters_ext *pma_cnt_ext,
 			       void *out)
 {
@@ -240,8 +187,8 @@
 			goto done;
 		}
 
-		err = mlx5_core_query_vport_counter(mdev, 0, 0,
-						    mdev_port_num, out_cnt, sz);
+		err = mlx5_core_query_vport_counter(mdev, 0, 0, mdev_port_num,
+						    out_cnt);
 		if (!err)
 			pma_cnt_ext_assign(pma_cnt_ext, out_cnt);
 	} else {
@@ -271,30 +218,65 @@
 
 int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 			const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-			const struct ib_mad_hdr *in, size_t in_mad_size,
-			struct ib_mad_hdr *out, size_t *out_mad_size,
-			u16 *out_mad_pkey_index)
+			const struct ib_mad *in, struct ib_mad *out,
+			size_t *out_mad_size, u16 *out_mad_pkey_index)
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	const struct ib_mad *in_mad = (const struct ib_mad *)in;
-	struct ib_mad *out_mad = (struct ib_mad *)out;
-	int ret;
+	u8 mgmt_class = in->mad_hdr.mgmt_class;
+	u8 method = in->mad_hdr.method;
+	u16 slid;
+	int err;
 
-	if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
-			 *out_mad_size != sizeof(*out_mad)))
+	slid = in_wc ? ib_lid_cpu16(in_wc->slid) :
+		       be16_to_cpu(IB_LID_PERMISSIVE);
+
+	if (method == IB_MGMT_METHOD_TRAP && !slid)
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+	switch (mgmt_class) {
+	case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: {
+		if (method != IB_MGMT_METHOD_GET &&
+		    method != IB_MGMT_METHOD_SET &&
+		    method != IB_MGMT_METHOD_TRAP_REPRESS)
+			return IB_MAD_RESULT_SUCCESS;
+
+		/* Don't process SMInfo queries -- the SMA can't handle them.
+		 */
+		if (in->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO)
+			return IB_MAD_RESULT_SUCCESS;
+	} break;
+	case IB_MGMT_CLASS_PERF_MGMT:
+		if (MLX5_CAP_GEN(dev->mdev, vport_counters) &&
+		    method == IB_MGMT_METHOD_GET)
+			return process_pma_cmd(dev, port_num, in, out);
+		fallthrough;
+	case MLX5_IB_VENDOR_CLASS1:
+	case MLX5_IB_VENDOR_CLASS2:
+	case IB_MGMT_CLASS_CONG_MGMT: {
+		if (method != IB_MGMT_METHOD_GET &&
+		    method != IB_MGMT_METHOD_SET)
+			return IB_MAD_RESULT_SUCCESS;
+	} break;
+	default:
+		return IB_MAD_RESULT_SUCCESS;
+	}
+
+	err = mlx5_MAD_IFC(to_mdev(ibdev), mad_flags & IB_MAD_IGNORE_MKEY,
+			   mad_flags & IB_MAD_IGNORE_BKEY, port_num, in_wc,
+			   in_grh, in, out);
+	if (err)
 		return IB_MAD_RESULT_FAILURE;
 
-	memset(out_mad->data, 0, sizeof(out_mad->data));
+	/* set return bit in status of directed route responses */
+	if (mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		out->mad_hdr.status |= cpu_to_be16(1 << 15);
 
-	if (MLX5_CAP_GEN(dev->mdev, vport_counters) &&
-	    in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT &&
-	    in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) {
-		ret = process_pma_cmd(dev, port_num, in_mad, out_mad);
-	} else {
-		ret =  process_mad(ibdev, mad_flags, port_num, in_wc, in_grh,
-				   in_mad, out_mad);
-	}
-	return ret;
+	if (method == IB_MGMT_METHOD_TRAP_REPRESS)
+		/* no response for trap repress */
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
 }
 
 int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port)
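
Note on the mad.c rework above: the old process_mad() helper is folded into mlx5_ib_process_mad(), which filters by (management class, method) in one switch — subnet-management classes accept GET/SET/TRAP_REPRESS (SMInfo queries are dropped), PERF_MGMT GETs are answered from vport counters via process_pma_cmd() when supported, vendor and congestion-management classes accept GET/SET, and anything else is consumed as success before reaching mlx5_MAD_IFC(). The sketch below models that classification in standalone C with illustrative stand-in constants; it omits the SMInfo special case for brevity.

	#include <stdbool.h>
	#include <stdio.h>

	enum mgmt_class { CLS_SUBN_LID, CLS_SUBN_DR, CLS_PERF,
			  CLS_VENDOR1, CLS_VENDOR2, CLS_CONG, CLS_OTHER };
	enum method { M_GET, M_SET, M_TRAP, M_TRAP_REPRESS, M_OTHER };

	enum verdict { DROP_SUCCESS, ANSWER_FROM_COUNTERS, FORWARD_TO_FW };

	static enum verdict classify(enum mgmt_class cls, enum method m, bool vport_counters)
	{
		switch (cls) {
		case CLS_SUBN_LID:
		case CLS_SUBN_DR:
			if (m != M_GET && m != M_SET && m != M_TRAP_REPRESS)
				return DROP_SUCCESS;
			return FORWARD_TO_FW;
		case CLS_PERF:
			if (vport_counters && m == M_GET)
				return ANSWER_FROM_COUNTERS;	/* process_pma_cmd() path */
			/* fall through to the GET/SET filter shared with vendor classes */
		case CLS_VENDOR1:
		case CLS_VENDOR2:
		case CLS_CONG:
			if (m != M_GET && m != M_SET)
				return DROP_SUCCESS;
			return FORWARD_TO_FW;
		default:
			return DROP_SUCCESS;
		}
	}

	int main(void)
	{
		printf("PERF GET    -> %d\n", classify(CLS_PERF, M_GET, true));
		printf("SUBN DR SET -> %d\n", classify(CLS_SUBN_DR, M_SET, false));
		printf("other class -> %d\n", classify(CLS_OTHER, M_GET, false));
		return 0;
	}
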
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 9025086..eb69bec 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1,33 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
 /*
- * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
  */
 
 #include <linux/debugfs.h>
@@ -39,9 +12,6 @@
 #include <linux/dma-mapping.h>
 #include <linux/slab.h>
 #include <linux/bitmap.h>
-#if defined(CONFIG_X86)
-#include <asm/pat.h>
-#endif
 #include <linux/sched.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/task.h>
@@ -56,32 +26,32 @@
 #include <linux/list.h>
 #include <rdma/ib_smi.h>
 #include <rdma/ib_umem.h>
+#include <rdma/lag.h>
 #include <linux/in.h>
 #include <linux/etherdevice.h>
 #include "mlx5_ib.h"
 #include "ib_rep.h"
 #include "cmd.h"
+#include "devx.h"
+#include "fs.h"
 #include "srq.h"
-#include <linux/mlx5/fs_helpers.h>
+#include "qp.h"
+#include "wr.h"
+#include "restrack.h"
+#include "counters.h"
 #include <linux/mlx5/accel.h>
 #include <rdma/uverbs_std_types.h>
 #include <rdma/mlx5_user_ioctl_verbs.h>
 #include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/ib_umem_odp.h>
 
 #define UVERBS_MODULE_NAME mlx5_ib
 #include <rdma/uverbs_named_ioctl.h>
 
-#define DRIVER_NAME "mlx5_ib"
-#define DRIVER_VERSION "5.0-0"
-
 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
-MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
+MODULE_DESCRIPTION("Mellanox 5th generation network adapters (ConnectX series) IB driver");
 MODULE_LICENSE("Dual BSD/GPL");
 
-static char mlx5_version[] =
-	DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
-	DRIVER_VERSION "\n";
-
 struct mlx5_ib_event_work {
 	struct work_struct	work;
 	union {
@@ -317,9 +287,6 @@
 		*native_port_num = 1;
 
 	port = &ibdev->port[ib_port_num - 1];
-	if (!port)
-		return NULL;
-
 	spin_lock(&port->mp.mpi_lock);
 	mpi = ibdev->port[ib_port_num - 1].mp.mpi;
 	if (mpi && !mpi->unaffiliate) {
@@ -359,8 +326,8 @@
 	spin_unlock(&port->mp.mpi_lock);
 }
 
-static int translate_eth_legacy_proto_oper(u32 eth_proto_oper, u8 *active_speed,
-					   u8 *active_width)
+static int translate_eth_legacy_proto_oper(u32 eth_proto_oper,
+					   u16 *active_speed, u8 *active_width)
 {
 	switch (eth_proto_oper) {
 	case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII):
@@ -417,7 +384,7 @@
 	return 0;
 }
 
-static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u8 *active_speed,
+static int translate_eth_ext_proto_oper(u32 eth_proto_oper, u16 *active_speed,
 					u8 *active_width)
 {
 	switch (eth_proto_oper) {
@@ -469,7 +436,7 @@
 	return 0;
 }
 
-static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed,
+static int translate_eth_proto_oper(u32 eth_proto_oper, u16 *active_speed,
 				    u8 *active_width, bool ext)
 {
 	return ext ?
@@ -579,7 +546,7 @@
 			 unsigned int index, const union ib_gid *gid,
 			 const struct ib_gid_attr *attr)
 {
-	enum ib_gid_type gid_type = IB_GID_TYPE_IB;
+	enum ib_gid_type gid_type = IB_GID_TYPE_ROCE;
 	u16 vlan_id = 0xffff;
 	u8 roce_version = 0;
 	u8 roce_l3_type = 0;
@@ -594,7 +561,7 @@
 	}
 
 	switch (gid_type) {
-	case IB_GID_TYPE_IB:
+	case IB_GID_TYPE_ROCE:
 		roce_version = MLX5_ROCE_VERSION_1;
 		break;
 	case IB_GID_TYPE_ROCE_UDP_ENCAP:
@@ -629,8 +596,8 @@
 			     attr->index, NULL, NULL);
 }
 
-__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev,
-			       const struct ib_gid_attr *attr)
+__be16 mlx5_get_roce_udp_sport_min(const struct mlx5_ib_dev *dev,
+				   const struct ib_gid_attr *attr)
 {
 	if (attr->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
 		return 0;
@@ -693,21 +660,6 @@
 	get_atomic_caps(dev, atomic_size_qp, props);
 }
 
-static void get_atomic_caps_dc(struct mlx5_ib_dev *dev,
-			       struct ib_device_attr *props)
-{
-	u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc);
-
-	get_atomic_caps(dev, atomic_size_qp, props);
-}
-
-bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev)
-{
-	struct ib_device_attr props = {};
-
-	get_atomic_caps_dc(dev, &props);
-	return (props.atomic_cap == IB_ATOMIC_HCA) ? true : false;
-}
 static int mlx5_query_system_image_guid(struct ib_device *ibdev,
 					__be64 *sys_image_guid)
 {
@@ -845,8 +797,8 @@
 	resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
 	if (uhw_outlen && uhw_outlen < resp_len)
 		return -EINVAL;
-	else
-		resp.response_length = resp_len;
+
+	resp.response_length = resp_len;
 
 	if (uhw && uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
 		return -EINVAL;
@@ -914,7 +866,7 @@
 			props->raw_packet_caps |=
 				IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
 
-		if (field_avail(typeof(resp), tso_caps, uhw_outlen)) {
+		if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen) {
 			max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
 			if (max_tso) {
 				resp.tso_caps.max_tso = 1 << max_tso;
@@ -924,7 +876,7 @@
 			}
 		}
 
-		if (field_avail(typeof(resp), rss_caps, uhw_outlen)) {
+		if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen) {
 			resp.rss_caps.rx_hash_function =
 						MLX5_RX_HASH_FUNC_TOEPLITZ;
 			resp.rss_caps.rx_hash_fields_mask =
@@ -944,9 +896,9 @@
 			resp.response_length += sizeof(resp.rss_caps);
 		}
 	} else {
-		if (field_avail(typeof(resp), tso_caps, uhw_outlen))
+		if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen)
 			resp.response_length += sizeof(resp.tso_caps);
-		if (field_avail(typeof(resp), rss_caps, uhw_outlen))
+		if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen)
 			resp.response_length += sizeof(resp.rss_caps);
 	}
 
@@ -1014,13 +966,14 @@
 		1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
 	props->max_pi_fast_reg_page_list_len =
 		props->max_fast_reg_page_list_len / 2;
+	props->max_sgl_rd =
+		MLX5_CAP_GEN(mdev, max_sgl_for_optimized_performance);
 	get_atomic_caps_qp(dev, props);
 	props->masked_atomic_cap   = IB_ATOMIC_NONE;
 	props->max_mcast_grp	   = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
 	props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
 	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
 					   props->max_mcast_grp;
-	props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
 	props->max_ah = INT_MAX;
 	props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
 	props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
@@ -1029,12 +982,29 @@
 		if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
 			props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
 		props->odp_caps = dev->odp_caps;
+		if (!uhw) {
+			/* ODP for kernel QPs is not implemented for receive
+			 * WQEs and SRQ WQEs
+			 */
+			props->odp_caps.per_transport_caps.rc_odp_caps &=
+				~(IB_ODP_SUPPORT_READ |
+				  IB_ODP_SUPPORT_SRQ_RECV);
+			props->odp_caps.per_transport_caps.uc_odp_caps &=
+				~(IB_ODP_SUPPORT_READ |
+				  IB_ODP_SUPPORT_SRQ_RECV);
+			props->odp_caps.per_transport_caps.ud_odp_caps &=
+				~(IB_ODP_SUPPORT_READ |
+				  IB_ODP_SUPPORT_SRQ_RECV);
+			props->odp_caps.per_transport_caps.xrc_odp_caps &=
+				~(IB_ODP_SUPPORT_READ |
+				  IB_ODP_SUPPORT_SRQ_RECV);
+		}
 	}
 
 	if (MLX5_CAP_GEN(mdev, cd))
 		props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
 
-	if (!mlx5_core_is_pf(mdev))
+	if (mlx5_core_is_vf(mdev))
 		props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
 
 	if (mlx5_ib_port_link_layer(ibdev, 1) ==
@@ -1069,7 +1039,7 @@
 						MLX5_MAX_CQ_PERIOD;
 	}
 
-	if (field_avail(typeof(resp), cqe_comp_caps, uhw_outlen)) {
+	if (offsetofend(typeof(resp), cqe_comp_caps) <= uhw_outlen) {
 		resp.response_length += sizeof(resp.cqe_comp_caps);
 
 		if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) {
@@ -1087,7 +1057,7 @@
 		}
 	}
 
-	if (field_avail(typeof(resp), packet_pacing_caps, uhw_outlen) &&
+	if (offsetofend(typeof(resp), packet_pacing_caps) <= uhw_outlen &&
 	    raw_support) {
 		if (MLX5_CAP_QOS(mdev, packet_pacing) &&
 		    MLX5_CAP_GEN(mdev, qos)) {
@@ -1105,8 +1075,8 @@
 		resp.response_length += sizeof(resp.packet_pacing_caps);
 	}
 
-	if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
-			uhw_outlen)) {
+	if (offsetofend(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes) <=
+	    uhw_outlen) {
 		if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe))
 			resp.mlx5_ib_support_multi_pkt_send_wqes =
 				MLX5_IB_ALLOW_MPW;
@@ -1119,7 +1089,7 @@
 			sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
 	}
 
-	if (field_avail(typeof(resp), flags, uhw_outlen)) {
+	if (offsetofend(typeof(resp), flags) <= uhw_outlen) {
 		resp.response_length += sizeof(resp.flags);
 
 		if (MLX5_CAP_GEN(mdev, cqe_compression_128))
@@ -1135,7 +1105,7 @@
 		resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
 	}
 
-	if (field_avail(typeof(resp), sw_parsing_caps, uhw_outlen)) {
+	if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) {
 		resp.response_length += sizeof(resp.sw_parsing_caps);
 		if (MLX5_CAP_ETH(mdev, swp)) {
 			resp.sw_parsing_caps.sw_parsing_offloads |=
@@ -1155,7 +1125,7 @@
 		}
 	}
 
-	if (field_avail(typeof(resp), striding_rq_caps, uhw_outlen) &&
+	if (offsetofend(typeof(resp), striding_rq_caps) <= uhw_outlen &&
 	    raw_support) {
 		resp.response_length += sizeof(resp.striding_rq_caps);
 		if (MLX5_CAP_GEN(mdev, striding_rq)) {
@@ -1163,8 +1133,14 @@
 				MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES;
 			resp.striding_rq_caps.max_single_stride_log_num_of_bytes =
 				MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES;
-			resp.striding_rq_caps.min_single_wqe_log_num_of_strides =
-				MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
+			if (MLX5_CAP_GEN(dev->mdev, ext_stride_num_range))
+				resp.striding_rq_caps
+					.min_single_wqe_log_num_of_strides =
+					MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
+			else
+				resp.striding_rq_caps
+					.min_single_wqe_log_num_of_strides =
+					MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES;
 			resp.striding_rq_caps.max_single_wqe_log_num_of_strides =
 				MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES;
 			resp.striding_rq_caps.supported_qpts =
@@ -1172,7 +1148,7 @@
 		}
 	}
 
-	if (field_avail(typeof(resp), tunnel_offloads_caps, uhw_outlen)) {
+	if (offsetofend(typeof(resp), tunnel_offloads_caps) <= uhw_outlen) {
 		resp.response_length += sizeof(resp.tunnel_offloads_caps);
 		if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan))
 			resp.tunnel_offloads_caps |=
@@ -1201,32 +1177,24 @@
 	return 0;
 }
 
-enum mlx5_ib_width {
-	MLX5_IB_WIDTH_1X	= 1 << 0,
-	MLX5_IB_WIDTH_2X	= 1 << 1,
-	MLX5_IB_WIDTH_4X	= 1 << 2,
-	MLX5_IB_WIDTH_8X	= 1 << 3,
-	MLX5_IB_WIDTH_12X	= 1 << 4
-};
-
-static void translate_active_width(struct ib_device *ibdev, u8 active_width,
-				  u8 *ib_width)
+static void translate_active_width(struct ib_device *ibdev, u16 active_width,
+				   u8 *ib_width)
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
 
-	if (active_width & MLX5_IB_WIDTH_1X)
+	if (active_width & MLX5_PTYS_WIDTH_1X)
 		*ib_width = IB_WIDTH_1X;
-	else if (active_width & MLX5_IB_WIDTH_2X)
+	else if (active_width & MLX5_PTYS_WIDTH_2X)
 		*ib_width = IB_WIDTH_2X;
-	else if (active_width & MLX5_IB_WIDTH_4X)
+	else if (active_width & MLX5_PTYS_WIDTH_4X)
 		*ib_width = IB_WIDTH_4X;
-	else if (active_width & MLX5_IB_WIDTH_8X)
+	else if (active_width & MLX5_PTYS_WIDTH_8X)
 		*ib_width = IB_WIDTH_8X;
-	else if (active_width & MLX5_IB_WIDTH_12X)
+	else if (active_width & MLX5_PTYS_WIDTH_12X)
 		*ib_width = IB_WIDTH_12X;
 	else {
 		mlx5_ib_dbg(dev, "Invalid active_width %d, setting width to default value: 4x\n",
-			    (int)active_width);
+			    active_width);
 		*ib_width = IB_WIDTH_4X;
 	}
 
@@ -1303,7 +1271,7 @@
 	u16 max_mtu;
 	u16 oper_mtu;
 	int err;
-	u8 ib_link_width_oper;
+	u16 ib_link_width_oper;
 	u8 vl_hw_cap;
 
 	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
@@ -1336,16 +1304,13 @@
 	if (props->port_cap_flags & IB_PORT_CAP_MASK2_SUP)
 		props->port_cap_flags2 = rep->cap_mask2;
 
-	err = mlx5_query_port_link_width_oper(mdev, &ib_link_width_oper, port);
+	err = mlx5_query_ib_port_oper(mdev, &ib_link_width_oper,
+				      &props->active_speed, port);
 	if (err)
 		goto out;
 
 	translate_active_width(ibdev, ib_link_width_oper, &props->active_width);
 
-	err = mlx5_query_port_ib_proto_oper(mdev, &props->active_speed, port);
-	if (err)
-		goto out;
-
 	mlx5_query_port_max_mtu(mdev, &max_mtu, port);
 
 	props->max_mtu = mlx5_mtu_to_ib_mtu(max_mtu);
@@ -1764,6 +1729,92 @@
 	mlx5_ib_disable_lb(dev, true, false);
 }
 
+static int set_ucontext_resp(struct ib_ucontext *uctx,
+			     struct mlx5_ib_alloc_ucontext_resp *resp)
+{
+	struct ib_device *ibdev = uctx->device;
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct mlx5_ib_ucontext *context = to_mucontext(uctx);
+	struct mlx5_bfreg_info *bfregi = &context->bfregi;
+	int err;
+
+	if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
+		err = mlx5_cmd_dump_fill_mkey(dev->mdev,
+					      &resp->dump_fill_mkey);
+		if (err)
+			return err;
+		resp->comp_mask |=
+			MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY;
+	}
+
+	resp->qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
+	if (dev->wc_support)
+		resp->bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev,
+						      log_bf_reg_size);
+	resp->cache_line_size = cache_line_size();
+	resp->max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
+	resp->max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
+	resp->max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
+	resp->max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
+	resp->max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
+	resp->cqe_version = context->cqe_version;
+	resp->log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
+				MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
+	resp->num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
+					MLX5_CAP_GEN(dev->mdev,
+						     num_of_uars_per_page) : 1;
+
+	if (mlx5_accel_ipsec_device_caps(dev->mdev) &
+				MLX5_ACCEL_IPSEC_CAP_DEVICE) {
+		if (mlx5_get_flow_namespace(dev->mdev,
+				MLX5_FLOW_NAMESPACE_EGRESS))
+			resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM;
+		if (mlx5_accel_ipsec_device_caps(dev->mdev) &
+				MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA)
+			resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA;
+		if (MLX5_CAP_FLOWTABLE(dev->mdev, flow_table_properties_nic_receive.ft_field_support.outer_esp_spi))
+			resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING;
+		if (mlx5_accel_ipsec_device_caps(dev->mdev) &
+				MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN)
+			resp->flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN;
+		/* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */
+	}
+
+	resp->tot_bfregs = bfregi->lib_uar_dyn ? 0 :
+			bfregi->total_num_bfregs - bfregi->num_dyn_bfregs;
+	resp->num_ports = dev->num_ports;
+	resp->cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
+				      MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
+
+	if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
+		mlx5_query_min_inline(dev->mdev, &resp->eth_min_inline);
+		resp->eth_min_inline++;
+	}
+
+	if (dev->mdev->clock_info)
+		resp->clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1);
+
+	/*
+	 * We don't want to expose information from the PCI bar that is located
+	 * after 4096 bytes, so if the arch only supports larger pages, let's
+	 * pretend we don't support reading the HCA's core clock. This is also
+	 * forced by mmap function.
+	 */
+	if (PAGE_SIZE <= 4096) {
+		resp->comp_mask |=
+			MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
+		resp->hca_core_clock_offset =
+			offsetof(struct mlx5_init_seg,
+				 internal_timer_h) % PAGE_SIZE;
+	}
+
+	if (MLX5_CAP_GEN(dev->mdev, ece_support))
+		resp->comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE;
+
+	resp->num_dyn_bfregs = bfregi->num_dyn_bfregs;
+	return 0;
+}
+
 static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
 				  struct ib_udata *udata)
 {
@@ -1771,15 +1822,14 @@
 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
 	struct mlx5_ib_alloc_ucontext_req_v2 req = {};
 	struct mlx5_ib_alloc_ucontext_resp resp = {};
-	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_ib_ucontext *context = to_mucontext(uctx);
 	struct mlx5_bfreg_info *bfregi;
 	int ver;
 	int err;
 	size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
 				     max_cqe_version);
-	u32 dump_fill_mkey;
 	bool lib_uar_4k;
+	bool lib_uar_dyn;
 
 	if (!dev->ib_active)
 		return -EAGAIN;
@@ -1806,40 +1856,15 @@
 	if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
 		return -EINVAL;
 
-	resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
-	if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
-		resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
-	resp.cache_line_size = cache_line_size();
-	resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
-	resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
-	resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
-	resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
-	resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
-	resp.cqe_version = min_t(__u8,
-				 (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
-				 req.max_cqe_version);
-	resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
-				MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT;
-	resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
-					MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1;
-	resp.response_length = min(offsetof(typeof(resp), response_length) +
-				   sizeof(resp.response_length), udata->outlen);
-
-	if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_DEVICE) {
-		if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_EGRESS))
-			resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM;
-		if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_REQUIRED_METADATA)
-			resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_REQ_METADATA;
-		if (MLX5_CAP_FLOWTABLE(dev->mdev, flow_table_properties_nic_receive.ft_field_support.outer_esp_spi))
-			resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_SPI_STEERING;
-		if (mlx5_accel_ipsec_device_caps(dev->mdev) & MLX5_ACCEL_IPSEC_CAP_TX_IV_IS_ESN)
-			resp.flow_action_flags |= MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_TX_IV_IS_ESN;
-		/* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */
-	}
-
 	lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
+	lib_uar_dyn = req.lib_caps & MLX5_LIB_CAP_DYN_UAR;
 	bfregi = &context->bfregi;
 
+	if (lib_uar_dyn) {
+		bfregi->lib_uar_dyn = lib_uar_dyn;
+		goto uar_done;
+	}
+
 	/* updates req->total_num_bfregs */
 	err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi);
 	if (err)
@@ -1866,6 +1891,7 @@
 	if (err)
 		goto out_sys_pages;
 
+uar_done:
 	if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
 		err = mlx5_ib_devx_create(dev, true);
 		if (err < 0)
@@ -1878,88 +1904,28 @@
 	if (err)
 		goto out_devx;
 
-	if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
-		err = mlx5_cmd_dump_fill_mkey(dev->mdev, &dump_fill_mkey);
-		if (err)
-			goto out_mdev;
-	}
-
 	INIT_LIST_HEAD(&context->db_page_list);
 	mutex_init(&context->db_page_mutex);
 
-	resp.tot_bfregs = req.total_num_bfregs;
-	resp.num_ports = dev->num_ports;
+	context->cqe_version = min_t(__u8,
+				 (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
+				 req.max_cqe_version);
 
-	if (field_avail(typeof(resp), cqe_version, udata->outlen))
-		resp.response_length += sizeof(resp.cqe_version);
+	err = set_ucontext_resp(uctx, &resp);
+	if (err)
+		goto out_mdev;
 
-	if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
-		resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
-				      MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
-		resp.response_length += sizeof(resp.cmds_supp_uhw);
-	}
-
-	if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) {
-		if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
-			mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline);
-			resp.eth_min_inline++;
-		}
-		resp.response_length += sizeof(resp.eth_min_inline);
-	}
-
-	if (field_avail(typeof(resp), clock_info_versions, udata->outlen)) {
-		if (mdev->clock_info)
-			resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1);
-		resp.response_length += sizeof(resp.clock_info_versions);
-	}
-
-	/*
-	 * We don't want to expose information from the PCI bar that is located
-	 * after 4096 bytes, so if the arch only supports larger pages, let's
-	 * pretend we don't support reading the HCA's core clock. This is also
-	 * forced by mmap function.
-	 */
-	if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
-		if (PAGE_SIZE <= 4096) {
-			resp.comp_mask |=
-				MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
-			resp.hca_core_clock_offset =
-				offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE;
-		}
-		resp.response_length += sizeof(resp.hca_core_clock_offset);
-	}
-
-	if (field_avail(typeof(resp), log_uar_size, udata->outlen))
-		resp.response_length += sizeof(resp.log_uar_size);
-
-	if (field_avail(typeof(resp), num_uars_per_page, udata->outlen))
-		resp.response_length += sizeof(resp.num_uars_per_page);
-
-	if (field_avail(typeof(resp), num_dyn_bfregs, udata->outlen)) {
-		resp.num_dyn_bfregs = bfregi->num_dyn_bfregs;
-		resp.response_length += sizeof(resp.num_dyn_bfregs);
-	}
-
-	if (field_avail(typeof(resp), dump_fill_mkey, udata->outlen)) {
-		if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) {
-			resp.dump_fill_mkey = dump_fill_mkey;
-			resp.comp_mask |=
-				MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY;
-		}
-		resp.response_length += sizeof(resp.dump_fill_mkey);
-	}
-
+	resp.response_length = min(udata->outlen, sizeof(resp));
 	err = ib_copy_to_udata(udata, &resp, resp.response_length);
 	if (err)
 		goto out_mdev;
 
 	bfregi->ver = ver;
 	bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs;
-	context->cqe_version = resp.cqe_version;
 	context->lib_caps = req.lib_caps;
 	print_lib_caps(dev, context->lib_caps);
 
-	if (dev->lag_active) {
+	if (mlx5_ib_lag_should_assign_affinity(dev)) {
 		u8 port = mlx5_core_native_port_num(dev->mdev) - 1;
 
 		atomic_set(&context->tx_port_affinity,
@@ -1988,6 +1954,29 @@
 	return err;
 }
 
+static int mlx5_ib_query_ucontext(struct ib_ucontext *ibcontext,
+				  struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_alloc_ucontext_resp uctx_resp = {};
+	int ret;
+
+	ret = set_ucontext_resp(ibcontext, &uctx_resp);
+	if (ret)
+		return ret;
+
+	uctx_resp.response_length =
+		min_t(size_t,
+		      uverbs_attr_get_len(attrs,
+				MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX),
+		      sizeof(uctx_resp));
+
+	ret = uverbs_copy_to_struct_or_zero(attrs,
+					MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX,
+					&uctx_resp,
+					sizeof(uctx_resp));
+	return ret;
+}
+
 static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
 {
 	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
@@ -2015,6 +2004,17 @@
 	return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
 }
 
+static u64 uar_index2paddress(struct mlx5_ib_dev *dev,
+				 int uar_idx)
+{
+	unsigned int fw_uars_per_page;
+
+	fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ?
+				MLX5_UARS_IN_PAGE : 1;
+
+	return (dev->mdev->bar_addr + (uar_idx / fw_uars_per_page) * PAGE_SIZE);
+}
+
 static int get_command(unsigned long offset)
 {
 	return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
@@ -2079,6 +2079,36 @@
 			      virt_to_page(dev->mdev->clock_info));
 }
 
+static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry)
+{
+	struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
+	struct mlx5_ib_dev *dev = to_mdev(entry->ucontext->device);
+	struct mlx5_var_table *var_table = &dev->var_table;
+	struct mlx5_ib_dm *mdm;
+
+	switch (mentry->mmap_flag) {
+	case MLX5_IB_MMAP_TYPE_MEMIC:
+		mdm = container_of(mentry, struct mlx5_ib_dm, mentry);
+		mlx5_cmd_dealloc_memic(&dev->dm, mdm->dev_addr,
+				       mdm->size);
+		kfree(mdm);
+		break;
+	case MLX5_IB_MMAP_TYPE_VAR:
+		mutex_lock(&var_table->bitmap_lock);
+		clear_bit(mentry->page_idx, var_table->bitmap);
+		mutex_unlock(&var_table->bitmap_lock);
+		kfree(mentry);
+		break;
+	case MLX5_IB_MMAP_TYPE_UAR_WC:
+	case MLX5_IB_MMAP_TYPE_UAR_NC:
+		mlx5_cmd_free_uar(dev->mdev, mentry->page_idx);
+		kfree(mentry);
+		break;
+	default:
+		WARN_ON(true);
+	}
+}
+
 static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
 		    struct vm_area_struct *vma,
 		    struct mlx5_ib_ucontext *context)
@@ -2094,6 +2124,9 @@
 	int max_valid_idx = dyn_uar ? bfregi->num_sys_pages :
 				bfregi->num_static_sys_pages;
 
+	if (bfregi->lib_uar_dyn)
+		return -EINVAL;
+
 	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
 		return -EINVAL;
 
@@ -2111,14 +2144,6 @@
 	switch (cmd) {
 	case MLX5_IB_MMAP_WC_PAGE:
 	case MLX5_IB_MMAP_ALLOC_WC:
-/* Some architectures don't support WC memory */
-#if defined(CONFIG_X86)
-		if (!pat_enabled())
-			return -EPERM;
-#elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU)))
-			return -EPERM;
-#endif
-	/* fall through */
 	case MLX5_IB_MMAP_REGULAR_PAGE:
 		/* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */
 		prot = pgprot_writecombine(vma->vm_page_prot);
@@ -2167,7 +2192,7 @@
 	mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
 
 	err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE,
-				prot);
+				prot, NULL);
 	if (err) {
 		mlx5_ib_err(dev,
 			    "rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n",
@@ -2191,25 +2216,68 @@
 	return err;
 }
 
-static int dm_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+static int add_dm_mmap_entry(struct ib_ucontext *context,
+			     struct mlx5_ib_dm *mdm,
+			     u64 address)
 {
-	struct mlx5_ib_ucontext *mctx = to_mucontext(context);
-	struct mlx5_ib_dev *dev = to_mdev(context->device);
-	u16 page_idx = get_extended_index(vma->vm_pgoff);
-	size_t map_size = vma->vm_end - vma->vm_start;
-	u32 npages = map_size >> PAGE_SHIFT;
-	phys_addr_t pfn;
+	mdm->mentry.mmap_flag = MLX5_IB_MMAP_TYPE_MEMIC;
+	mdm->mentry.address = address;
+	return rdma_user_mmap_entry_insert_range(
+			context, &mdm->mentry.rdma_entry,
+			mdm->size,
+			MLX5_IB_MMAP_DEVICE_MEM << 16,
+			(MLX5_IB_MMAP_DEVICE_MEM << 16) + (1UL << 16) - 1);
+}
 
-	if (find_next_zero_bit(mctx->dm_pages, page_idx + npages, page_idx) !=
-	    page_idx + npages)
+static unsigned long mlx5_vma_to_pgoff(struct vm_area_struct *vma)
+{
+	unsigned long idx;
+	u8 command;
+
+	command = get_command(vma->vm_pgoff);
+	idx = get_extended_index(vma->vm_pgoff);
+
+	return (command << 16 | idx);
+}
+
+static int mlx5_ib_mmap_offset(struct mlx5_ib_dev *dev,
+			       struct vm_area_struct *vma,
+			       struct ib_ucontext *ucontext)
+{
+	struct mlx5_user_mmap_entry *mentry;
+	struct rdma_user_mmap_entry *entry;
+	unsigned long pgoff;
+	pgprot_t prot;
+	phys_addr_t pfn;
+	int ret;
+
+	pgoff = mlx5_vma_to_pgoff(vma);
+	entry = rdma_user_mmap_entry_get_pgoff(ucontext, pgoff);
+	if (!entry)
 		return -EINVAL;
 
-	pfn = ((dev->mdev->bar_addr +
-	      MLX5_CAP64_DEV_MEM(dev->mdev, memic_bar_start_addr)) >>
-	      PAGE_SHIFT) +
-	      page_idx;
-	return rdma_user_mmap_io(context, vma, pfn, map_size,
-				 pgprot_writecombine(vma->vm_page_prot));
+	mentry = to_mmmap(entry);
+	pfn = (mentry->address >> PAGE_SHIFT);
+	if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR ||
+	    mentry->mmap_flag == MLX5_IB_MMAP_TYPE_UAR_NC)
+		prot = pgprot_noncached(vma->vm_page_prot);
+	else
+		prot = pgprot_writecombine(vma->vm_page_prot);
+	ret = rdma_user_mmap_io(ucontext, vma, pfn,
+				entry->npages * PAGE_SIZE,
+				prot,
+				entry);
+	rdma_user_mmap_entry_put(&mentry->rdma_entry);
+	return ret;
+}
+
+static u64 mlx5_entry_to_mmap_offset(struct mlx5_user_mmap_entry *entry)
+{
+	u64 cmd = (entry->rdma_entry.start_pgoff >> 16) & 0xFFFF;
+	u64 index = entry->rdma_entry.start_pgoff & 0xFFFF;
+
+	return (((index >> 8) << 16) | (cmd << MLX5_IB_MMAP_CMD_SHIFT) |
+		(index & 0xFF)) << PAGE_SHIFT;
 }
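The two helpers added above define how a user-visible mmap offset round-trips through the driver: mlx5_vma_to_pgoff() folds the command and the extended index into a single lookup key (command << 16 | index), and mlx5_entry_to_mmap_offset() performs the reverse packing when an entry's offset is reported back to user space. The standalone demo below reproduces that arithmetic; the 8-bit command field at MLX5_IB_MMAP_CMD_SHIFT == 8 and the get_command()/get_extended_index() layouts are assumptions taken from the driver's existing helpers, which are not part of this hunk.

	/* Round-trip demo for the mmap offset packing (assumptions noted above). */
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	#define CMD_SHIFT	8	/* assumed MLX5_IB_MMAP_CMD_SHIFT */
	#define CMD_MASK	0xff	/* assumed MLX5_IB_MMAP_CMD_MASK */

	/* kernel side: derive the entry lookup key from vma->vm_pgoff */
	static uint32_t vma_to_pgoff(uint64_t vm_pgoff)
	{
		uint32_t command = (vm_pgoff >> CMD_SHIFT) & CMD_MASK;
		uint32_t index = (vm_pgoff & 0xff) |
				 (((vm_pgoff >> 16) & 0xff) << 8);

		return command << 16 | index;
	}

	/* kernel side: rebuild the user-visible byte offset from start_pgoff */
	static uint64_t entry_to_mmap_offset(uint32_t start_pgoff,
					     unsigned int page_shift)
	{
		uint64_t cmd = (start_pgoff >> 16) & 0xFFFF;
		uint64_t index = start_pgoff & 0xFFFF;

		return (((index >> 8) << 16) | (cmd << CMD_SHIFT) |
			(index & 0xFF)) << page_shift;
	}

	int main(void)
	{
		uint32_t key = (0x5u << 16) | 0x1234; /* command 5, index 0x1234 */
		uint64_t off = entry_to_mmap_offset(key, 12);

		/* user space passes "off" to mmap(); the kernel recovers the key */
		assert(vma_to_pgoff(off >> 12) == key);
		printf("offset 0x%llx -> key 0x%x\n",
		       (unsigned long long)off, key);
		return 0;
	}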
 
 static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
@@ -2222,9 +2290,12 @@
 	command = get_command(vma->vm_pgoff);
 	switch (command) {
 	case MLX5_IB_MMAP_WC_PAGE:
+	case MLX5_IB_MMAP_ALLOC_WC:
+		if (!dev->wc_support)
+			return -EPERM;
+		fallthrough;
 	case MLX5_IB_MMAP_NC_PAGE:
 	case MLX5_IB_MMAP_REGULAR_PAGE:
-	case MLX5_IB_MMAP_ALLOC_WC:
 		return uar_mmap(dev, command, vma, context);
 
 	case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
@@ -2247,15 +2318,13 @@
 			PAGE_SHIFT;
 		return rdma_user_mmap_io(&context->ibucontext, vma, pfn,
 					 PAGE_SIZE,
-					 pgprot_noncached(vma->vm_page_prot));
+					 pgprot_noncached(vma->vm_page_prot),
+					 NULL);
 	case MLX5_IB_MMAP_CLOCK_INFO:
 		return mlx5_ib_mmap_clock_info_page(dev, vma, context);
 
-	case MLX5_IB_MMAP_DEVICE_MEM:
-		return dm_mmap(ibcontext, vma);
-
 	default:
-		return -EINVAL;
+		return mlx5_ib_mmap_offset(dev, vma, ibcontext);
 	}
 
 	return 0;
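In the dispatch above, the write-combining commands are now gated on dev->wc_support instead of the per-architecture #ifdef block that was removed from uar_mmap(), and any command without a dedicated case falls through to mlx5_ib_mmap_offset(), which resolves vm_pgoff against the ucontext's mmap-entry table. That is how MLX5_IB_MMAP_DEVICE_MEM is served once its explicit case goes away. A rough user-space counterpart is sketched below; it assumes the usual provider pattern of mapping the uverbs command fd at the kernel-supplied offset, and the names are illustrative only.

	/*
	 * Hypothetical user-space view, not from this patch: a provider maps a
	 * driver page by handing the command-encoded offset back to mmap() on
	 * its uverbs command fd. "cmd_fd" and "mmap_offset" would come from
	 * the verbs context and the kernel response in a real provider.
	 */
	#include <stdint.h>
	#include <sys/mman.h>
	#include <unistd.h>

	static void *map_driver_page(int cmd_fd, uint64_t mmap_offset)
	{
		long page = sysconf(_SC_PAGESIZE);
		void *p = mmap(NULL, page, PROT_READ | PROT_WRITE, MAP_SHARED,
			       cmd_fd, (off_t)mmap_offset);

		return p == MAP_FAILED ? NULL : p;
	}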
@@ -2276,7 +2345,9 @@
 			return -EPERM;
 
 		if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner) ||
-		      MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, sw_owner)))
+		      MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, sw_owner) ||
+		      MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner_v2) ||
+		      MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, sw_owner_v2)))
 			return -EOPNOTSUPP;
 		break;
 	}
@@ -2291,8 +2362,9 @@
 {
 	struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
 	u64 start_offset;
-	u32 page_idx;
+	u16 page_idx;
 	int err;
+	u64 address;
 
 	dm->size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE);
 
@@ -2301,28 +2373,30 @@
 	if (err)
 		return err;
 
-	page_idx = (dm->dev_addr - pci_resource_start(dm_db->dev->pdev, 0) -
-		    MLX5_CAP64_DEV_MEM(dm_db->dev, memic_bar_start_addr)) >>
-		    PAGE_SHIFT;
-
-	err = uverbs_copy_to(attrs,
-			     MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
-			     &page_idx, sizeof(page_idx));
+	address = dm->dev_addr & PAGE_MASK;
+	err = add_dm_mmap_entry(ctx, dm, address);
 	if (err)
 		goto err_dealloc;
 
+	page_idx = dm->mentry.rdma_entry.start_pgoff & 0xFFFF;
+	err = uverbs_copy_to(attrs,
+			     MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
+			     &page_idx,
+			     sizeof(page_idx));
+	if (err)
+		goto err_copy;
+
 	start_offset = dm->dev_addr & ~PAGE_MASK;
 	err = uverbs_copy_to(attrs,
 			     MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
 			     &start_offset, sizeof(start_offset));
 	if (err)
-		goto err_dealloc;
-
-	bitmap_set(to_mucontext(ctx)->dm_pages, page_idx,
-		   DIV_ROUND_UP(dm->size, PAGE_SIZE));
+		goto err_copy;
 
 	return 0;
 
+err_copy:
+	rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
 err_dealloc:
 	mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
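With the entry-based scheme, the MEMIC response still carries a page index and an in-page offset, but the index now comes from the mmap entry's start_pgoff rather than from a BAR-relative calculation, and the address stored in the entry is the page-aligned part of dev_addr. A standalone illustration of that split, with the final user pointer being the mmap base plus start_offset:

	/* Standalone illustration of the dev_addr split above (4 KiB page assumed). */
	#include <stdint.h>
	#include <stdio.h>

	#define DEMO_PAGE_SIZE	4096ULL
	#define DEMO_PAGE_MASK	(~(DEMO_PAGE_SIZE - 1))

	int main(void)
	{
		uint64_t dev_addr = 0x1234567ULL;	/* arbitrary device address */
		uint64_t address = dev_addr & DEMO_PAGE_MASK;	 /* kept in the mmap entry */
		uint64_t start_offset = dev_addr & ~DEMO_PAGE_MASK; /* returned to user space */

		/* user space: ptr = mmap(<DEVICE_MEM offset for page_idx>) + start_offset */
		printf("address=0x%llx start_offset=0x%llx\n",
		       (unsigned long long)address,
		       (unsigned long long)start_offset);
		return 0;
	}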
 
@@ -2346,7 +2420,7 @@
 	act_size = roundup_pow_of_two(act_size);
 
 	dm->size = act_size;
-	err = mlx5_dm_sw_icm_alloc(dev, type, act_size,
+	err = mlx5_dm_sw_icm_alloc(dev, type, act_size, attr->alignment,
 				   to_mucontext(ctx)->devx_uid, &dm->dev_addr,
 				   &dm->icm_dm.obj_id);
 	if (err)
@@ -2426,23 +2500,13 @@
 	struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
 		&attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
 	struct mlx5_core_dev *dev = to_mdev(ibdm->device)->mdev;
-	struct mlx5_dm *dm_db = &to_mdev(ibdm->device)->dm;
 	struct mlx5_ib_dm *dm = to_mdm(ibdm);
-	u32 page_idx;
 	int ret;
 
 	switch (dm->type) {
 	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
-		ret = mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
-		if (ret)
-			return ret;
-
-		page_idx = (dm->dev_addr - pci_resource_start(dev->pdev, 0) -
-			    MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr)) >>
-			    PAGE_SHIFT;
-		bitmap_clear(ctx->dm_pages, page_idx,
-			     DIV_ROUND_UP(dm->size, PAGE_SIZE));
-		break;
+		rdma_user_mmap_entry_remove(&dm->mentry.rdma_entry);
+		return 0;
 	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
 		ret = mlx5_dm_sw_icm_dealloc(dev, MLX5_SW_ICM_TYPE_STEERING,
 					     dm->size, ctx->devx_uid, dm->dev_addr,
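Note that the MEMIC branch of dealloc_dm above no longer frees the device memory directly; it only removes the mmap entry. The actual release is presumably deferred until the last reference on the entry is dropped, through the driver's mmap_free ib_device op. The fragment below is a hypothetical illustration of what such a callback could look like, built from helpers visible in this patch (to_mmmap(), mlx5_cmd_dealloc_memic()) plus an assumed container layout; it is not code from this change.

	/*
	 * Hypothetical sketch, not part of this patch: deferred MEMIC release
	 * from an mmap_free callback. Assumes the MEMIC entry is embedded in
	 * struct mlx5_ib_dm (as add_dm_mmap_entry() above suggests) and that
	 * the DM database lives in dev->dm, matching calls elsewhere in this
	 * file.
	 */
	static void memic_mmap_free_sketch(struct rdma_user_mmap_entry *entry)
	{
		struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
		struct mlx5_ib_dev *dev = to_mdev(entry->ucontext->device);
		struct mlx5_ib_dm *mdm;

		if (mentry->mmap_flag != MLX5_IB_MMAP_TYPE_MEMIC)
			return;

		mdm = container_of(mentry, struct mlx5_ib_dm, mentry);
		mlx5_cmd_dealloc_memic(&dev->dm, mdm->dev_addr, mdm->size);
		kfree(mdm);	/* assumption: the DM object is freed here, not in dealloc_dm */
	}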
@@ -2473,7 +2537,7 @@
 	struct mlx5_ib_alloc_pd_resp resp;
 	int err;
 	u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
-	u32 in[MLX5_ST_SZ_DW(alloc_pd_in)]   = {};
+	u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {};
 	u16 uid = 0;
 	struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
 		udata, struct mlx5_ib_ucontext, ibucontext);
@@ -2481,8 +2545,7 @@
 	uid = context ? context->devx_uid : 0;
 	MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
 	MLX5_SET(alloc_pd_in, in, uid, uid);
-	err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in),
-			    out, sizeof(out));
+	err = mlx5_cmd_exec_inout(to_mdev(ibdev)->mdev, alloc_pd, in, out);
 	if (err)
 		return err;
 
@@ -2499,1844 +2562,12 @@
 	return 0;
 }
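The alloc_pd path above switches from an explicit mlx5_cmd_exec() call to mlx5_cmd_exec_inout(). Assuming this is the size-inferring convenience wrapper from the mlx5 core, which derives the in/out lengths from the ifc struct name, the new call is roughly equivalent to the old one:

	/* Rough expansion, assuming the standard mlx5_cmd_exec_inout() wrapper. */
	err = mlx5_cmd_exec(to_mdev(ibdev)->mdev,
			    in, MLX5_ST_SZ_BYTES(alloc_pd_in),
			    out, MLX5_ST_SZ_BYTES(alloc_pd_out));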
 
-static void mlx5_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+static int mlx5_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *mdev = to_mdev(pd->device);
 	struct mlx5_ib_pd *mpd = to_mpd(pd);
 
-	mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
-}
-
-enum {
-	MATCH_CRITERIA_ENABLE_OUTER_BIT,
-	MATCH_CRITERIA_ENABLE_MISC_BIT,
-	MATCH_CRITERIA_ENABLE_INNER_BIT,
-	MATCH_CRITERIA_ENABLE_MISC2_BIT
-};
-
-#define HEADER_IS_ZERO(match_criteria, headers)			           \
-	!(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
-		    0, MLX5_FLD_SZ_BYTES(fte_match_param, headers)))       \
-
-static u8 get_match_criteria_enable(u32 *match_criteria)
-{
-	u8 match_criteria_enable;
-
-	match_criteria_enable =
-		(!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
-		MATCH_CRITERIA_ENABLE_OUTER_BIT;
-	match_criteria_enable |=
-		(!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
-		MATCH_CRITERIA_ENABLE_MISC_BIT;
-	match_criteria_enable |=
-		(!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
-		MATCH_CRITERIA_ENABLE_INNER_BIT;
-	match_criteria_enable |=
-		(!HEADER_IS_ZERO(match_criteria, misc_parameters_2)) <<
-		MATCH_CRITERIA_ENABLE_MISC2_BIT;
-
-	return match_criteria_enable;
-}
-
-static int set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
-{
-	u8 entry_mask;
-	u8 entry_val;
-	int err = 0;
-
-	if (!mask)
-		goto out;
-
-	entry_mask = MLX5_GET(fte_match_set_lyr_2_4, outer_c,
-			      ip_protocol);
-	entry_val = MLX5_GET(fte_match_set_lyr_2_4, outer_v,
-			     ip_protocol);
-	if (!entry_mask) {
-		MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
-		MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
-		goto out;
-	}
-	/* Don't override existing ip protocol */
-	if (mask != entry_mask || val != entry_val)
-		err = -EINVAL;
-out:
-	return err;
-}
-
-static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val,
-			   bool inner)
-{
-	if (inner) {
-		MLX5_SET(fte_match_set_misc,
-			 misc_c, inner_ipv6_flow_label, mask);
-		MLX5_SET(fte_match_set_misc,
-			 misc_v, inner_ipv6_flow_label, val);
-	} else {
-		MLX5_SET(fte_match_set_misc,
-			 misc_c, outer_ipv6_flow_label, mask);
-		MLX5_SET(fte_match_set_misc,
-			 misc_v, outer_ipv6_flow_label, val);
-	}
-}
-
-static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
-{
-	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask);
-	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val);
-	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2);
-	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2);
-}
-
-static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask)
-{
-	if (MLX5_GET(fte_match_mpls, set_mask, mpls_label) &&
-	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_LABEL))
-		return -EOPNOTSUPP;
-
-	if (MLX5_GET(fte_match_mpls, set_mask, mpls_exp) &&
-	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_EXP))
-		return -EOPNOTSUPP;
-
-	if (MLX5_GET(fte_match_mpls, set_mask, mpls_s_bos) &&
-	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_S_BOS))
-		return -EOPNOTSUPP;
-
-	if (MLX5_GET(fte_match_mpls, set_mask, mpls_ttl) &&
-	    !(field_support & MLX5_FIELD_SUPPORT_MPLS_TTL))
-		return -EOPNOTSUPP;
-
-	return 0;
-}
-
-#define LAST_ETH_FIELD vlan_tag
-#define LAST_IB_FIELD sl
-#define LAST_IPV4_FIELD tos
-#define LAST_IPV6_FIELD traffic_class
-#define LAST_TCP_UDP_FIELD src_port
-#define LAST_TUNNEL_FIELD tunnel_id
-#define LAST_FLOW_TAG_FIELD tag_id
-#define LAST_DROP_FIELD size
-#define LAST_COUNTERS_FIELD counters
-
-/* Field is the last supported field */
-#define FIELDS_NOT_SUPPORTED(filter, field)\
-	memchr_inv((void *)&filter.field  +\
-		   sizeof(filter.field), 0,\
-		   sizeof(filter) -\
-		   offsetof(typeof(filter), field) -\
-		   sizeof(filter.field))
-
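FIELDS_NOT_SUPPORTED(), removed here along with the rest of the legacy flow-steering code in this file, rejects a user-supplied mask that sets anything beyond the driver's last supported field by scanning the bytes after that field for a non-zero value. A user-space re-implementation of the same idea, with memchr_inv() replaced by a plain loop and a made-up filter struct for demonstration:

	/* Standalone illustration of the "nothing past the last supported field" check. */
	#include <stdbool.h>
	#include <stddef.h>
	#include <stdio.h>

	struct demo_eth_filter {
		unsigned char dst_mac[6];
		unsigned char src_mac[6];
		unsigned short ether_type;
		unsigned short vlan_tag;	/* last field this "driver" supports */
		unsigned int some_future_field;	/* anything set here must be rejected */
	};

	/* true if any byte in [p, p + len) is non-zero (memchr_inv() analogue) */
	static bool any_nonzero(const void *p, size_t len)
	{
		const unsigned char *b = p;

		while (len--)
			if (*b++)
				return true;
		return false;
	}

	#define FIELDS_NOT_SUPPORTED(filter, field)				\
		any_nonzero((const char *)&(filter) +				\
			    offsetof(typeof(filter), field) +			\
			    sizeof((filter).field),				\
			    sizeof(filter) -					\
			    offsetof(typeof(filter), field) -			\
			    sizeof((filter).field))

	int main(void)
	{
		struct demo_eth_filter f = { .vlan_tag = 0x0fff };

		printf("ok mask rejected? %d\n",
		       FIELDS_NOT_SUPPORTED(f, vlan_tag));	/* prints 0 */
		f.some_future_field = 1;
		printf("bad mask rejected? %d\n",
		       FIELDS_NOT_SUPPORTED(f, vlan_tag));	/* prints 1 */
		return 0;
	}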
-int parse_flow_flow_action(struct mlx5_ib_flow_action *maction,
-			   bool is_egress,
-			   struct mlx5_flow_act *action)
-{
-
-	switch (maction->ib_action.type) {
-	case IB_FLOW_ACTION_ESP:
-		if (action->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
-				      MLX5_FLOW_CONTEXT_ACTION_DECRYPT))
-			return -EINVAL;
-		/* Currently only AES_GCM keymat is supported by the driver */
-		action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx;
-		action->action |= is_egress ?
-			MLX5_FLOW_CONTEXT_ACTION_ENCRYPT :
-			MLX5_FLOW_CONTEXT_ACTION_DECRYPT;
-		return 0;
-	case IB_FLOW_ACTION_UNSPECIFIED:
-		if (maction->flow_action_raw.sub_type ==
-		    MLX5_IB_FLOW_ACTION_MODIFY_HEADER) {
-			if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
-				return -EINVAL;
-			action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR;
-			action->modify_hdr =
-				maction->flow_action_raw.modify_hdr;
-			return 0;
-		}
-		if (maction->flow_action_raw.sub_type ==
-		    MLX5_IB_FLOW_ACTION_DECAP) {
-			if (action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP)
-				return -EINVAL;
-			action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
-			return 0;
-		}
-		if (maction->flow_action_raw.sub_type ==
-		    MLX5_IB_FLOW_ACTION_PACKET_REFORMAT) {
-			if (action->action &
-			    MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT)
-				return -EINVAL;
-			action->action |=
-				MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
-			action->pkt_reformat =
-				maction->flow_action_raw.pkt_reformat;
-			return 0;
-		}
-		/* fall through */
-	default:
-		return -EOPNOTSUPP;
-	}
-}
-
-static int parse_flow_attr(struct mlx5_core_dev *mdev,
-			   struct mlx5_flow_spec *spec,
-			   const union ib_flow_spec *ib_spec,
-			   const struct ib_flow_attr *flow_attr,
-			   struct mlx5_flow_act *action, u32 prev_type)
-{
-	struct mlx5_flow_context *flow_context = &spec->flow_context;
-	u32 *match_c = spec->match_criteria;
-	u32 *match_v = spec->match_value;
-	void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
-					   misc_parameters);
-	void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
-					   misc_parameters);
-	void *misc_params2_c = MLX5_ADDR_OF(fte_match_param, match_c,
-					    misc_parameters_2);
-	void *misc_params2_v = MLX5_ADDR_OF(fte_match_param, match_v,
-					    misc_parameters_2);
-	void *headers_c;
-	void *headers_v;
-	int match_ipv;
-	int ret;
-
-	if (ib_spec->type & IB_FLOW_SPEC_INNER) {
-		headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
-					 inner_headers);
-		headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
-					 inner_headers);
-		match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-					ft_field_support.inner_ip_version);
-	} else {
-		headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
-					 outer_headers);
-		headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
-					 outer_headers);
-		match_ipv = MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-					ft_field_support.outer_ip_version);
-	}
-
-	switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) {
-	case IB_FLOW_SPEC_ETH:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
-			return -EOPNOTSUPP;
-
-		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
-					     dmac_47_16),
-				ib_spec->eth.mask.dst_mac);
-		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
-					     dmac_47_16),
-				ib_spec->eth.val.dst_mac);
-
-		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
-					     smac_47_16),
-				ib_spec->eth.mask.src_mac);
-		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
-					     smac_47_16),
-				ib_spec->eth.val.src_mac);
-
-		if (ib_spec->eth.mask.vlan_tag) {
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 cvlan_tag, 1);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 cvlan_tag, 1);
-
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 first_vid, ntohs(ib_spec->eth.val.vlan_tag));
-
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 first_cfi,
-				 ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 first_cfi,
-				 ntohs(ib_spec->eth.val.vlan_tag) >> 12);
-
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 first_prio,
-				 ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 first_prio,
-				 ntohs(ib_spec->eth.val.vlan_tag) >> 13);
-		}
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-			 ethertype, ntohs(ib_spec->eth.mask.ether_type));
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-			 ethertype, ntohs(ib_spec->eth.val.ether_type));
-		break;
-	case IB_FLOW_SPEC_IPV4:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
-			return -EOPNOTSUPP;
-
-		if (match_ipv) {
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 ip_version, 0xf);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 ip_version, MLX5_FS_IPV4_VERSION);
-		} else {
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 ethertype, 0xffff);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 ethertype, ETH_P_IP);
-		}
-
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
-				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
-		       &ib_spec->ipv4.mask.src_ip,
-		       sizeof(ib_spec->ipv4.mask.src_ip));
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
-				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
-		       &ib_spec->ipv4.val.src_ip,
-		       sizeof(ib_spec->ipv4.val.src_ip));
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
-				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
-		       &ib_spec->ipv4.mask.dst_ip,
-		       sizeof(ib_spec->ipv4.mask.dst_ip));
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
-				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
-		       &ib_spec->ipv4.val.dst_ip,
-		       sizeof(ib_spec->ipv4.val.dst_ip));
-
-		set_tos(headers_c, headers_v,
-			ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
-
-		if (set_proto(headers_c, headers_v,
-			      ib_spec->ipv4.mask.proto,
-			      ib_spec->ipv4.val.proto))
-			return -EINVAL;
-		break;
-	case IB_FLOW_SPEC_IPV6:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
-			return -EOPNOTSUPP;
-
-		if (match_ipv) {
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 ip_version, 0xf);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 ip_version, MLX5_FS_IPV6_VERSION);
-		} else {
-			MLX5_SET(fte_match_set_lyr_2_4, headers_c,
-				 ethertype, 0xffff);
-			MLX5_SET(fte_match_set_lyr_2_4, headers_v,
-				 ethertype, ETH_P_IPV6);
-		}
-
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
-				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
-		       &ib_spec->ipv6.mask.src_ip,
-		       sizeof(ib_spec->ipv6.mask.src_ip));
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
-				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
-		       &ib_spec->ipv6.val.src_ip,
-		       sizeof(ib_spec->ipv6.val.src_ip));
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
-				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
-		       &ib_spec->ipv6.mask.dst_ip,
-		       sizeof(ib_spec->ipv6.mask.dst_ip));
-		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
-				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
-		       &ib_spec->ipv6.val.dst_ip,
-		       sizeof(ib_spec->ipv6.val.dst_ip));
-
-		set_tos(headers_c, headers_v,
-			ib_spec->ipv6.mask.traffic_class,
-			ib_spec->ipv6.val.traffic_class);
-
-		if (set_proto(headers_c, headers_v,
-			      ib_spec->ipv6.mask.next_hdr,
-			      ib_spec->ipv6.val.next_hdr))
-			return -EINVAL;
-
-		set_flow_label(misc_params_c, misc_params_v,
-			       ntohl(ib_spec->ipv6.mask.flow_label),
-			       ntohl(ib_spec->ipv6.val.flow_label),
-			       ib_spec->type & IB_FLOW_SPEC_INNER);
-		break;
-	case IB_FLOW_SPEC_ESP:
-		if (ib_spec->esp.mask.seq)
-			return -EOPNOTSUPP;
-
-		MLX5_SET(fte_match_set_misc, misc_params_c, outer_esp_spi,
-			 ntohl(ib_spec->esp.mask.spi));
-		MLX5_SET(fte_match_set_misc, misc_params_v, outer_esp_spi,
-			 ntohl(ib_spec->esp.val.spi));
-		break;
-	case IB_FLOW_SPEC_TCP:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
-					 LAST_TCP_UDP_FIELD))
-			return -EOPNOTSUPP;
-
-		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_TCP))
-			return -EINVAL;
-
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
-			 ntohs(ib_spec->tcp_udp.mask.src_port));
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_sport,
-			 ntohs(ib_spec->tcp_udp.val.src_port));
-
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_dport,
-			 ntohs(ib_spec->tcp_udp.mask.dst_port));
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, tcp_dport,
-			 ntohs(ib_spec->tcp_udp.val.dst_port));
-		break;
-	case IB_FLOW_SPEC_UDP:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
-					 LAST_TCP_UDP_FIELD))
-			return -EOPNOTSUPP;
-
-		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_UDP))
-			return -EINVAL;
-
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
-			 ntohs(ib_spec->tcp_udp.mask.src_port));
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_sport,
-			 ntohs(ib_spec->tcp_udp.val.src_port));
-
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_dport,
-			 ntohs(ib_spec->tcp_udp.mask.dst_port));
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport,
-			 ntohs(ib_spec->tcp_udp.val.dst_port));
-		break;
-	case IB_FLOW_SPEC_GRE:
-		if (ib_spec->gre.mask.c_ks_res0_ver)
-			return -EOPNOTSUPP;
-
-		if (set_proto(headers_c, headers_v, 0xff, IPPROTO_GRE))
-			return -EINVAL;
-
-		MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
-			 0xff);
-		MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
-			 IPPROTO_GRE);
-
-		MLX5_SET(fte_match_set_misc, misc_params_c, gre_protocol,
-			 ntohs(ib_spec->gre.mask.protocol));
-		MLX5_SET(fte_match_set_misc, misc_params_v, gre_protocol,
-			 ntohs(ib_spec->gre.val.protocol));
-
-		memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_c,
-				    gre_key.nvgre.hi),
-		       &ib_spec->gre.mask.key,
-		       sizeof(ib_spec->gre.mask.key));
-		memcpy(MLX5_ADDR_OF(fte_match_set_misc, misc_params_v,
-				    gre_key.nvgre.hi),
-		       &ib_spec->gre.val.key,
-		       sizeof(ib_spec->gre.val.key));
-		break;
-	case IB_FLOW_SPEC_MPLS:
-		switch (prev_type) {
-		case IB_FLOW_SPEC_UDP:
-			if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-						   ft_field_support.outer_first_mpls_over_udp),
-						   &ib_spec->mpls.mask.tag))
-				return -EOPNOTSUPP;
-
-			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
-					    outer_first_mpls_over_udp),
-			       &ib_spec->mpls.val.tag,
-			       sizeof(ib_spec->mpls.val.tag));
-			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
-					    outer_first_mpls_over_udp),
-			       &ib_spec->mpls.mask.tag,
-			       sizeof(ib_spec->mpls.mask.tag));
-			break;
-		case IB_FLOW_SPEC_GRE:
-			if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-						   ft_field_support.outer_first_mpls_over_gre),
-						   &ib_spec->mpls.mask.tag))
-				return -EOPNOTSUPP;
-
-			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
-					    outer_first_mpls_over_gre),
-			       &ib_spec->mpls.val.tag,
-			       sizeof(ib_spec->mpls.val.tag));
-			memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
-					    outer_first_mpls_over_gre),
-			       &ib_spec->mpls.mask.tag,
-			       sizeof(ib_spec->mpls.mask.tag));
-			break;
-		default:
-			if (ib_spec->type & IB_FLOW_SPEC_INNER) {
-				if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-							   ft_field_support.inner_first_mpls),
-							   &ib_spec->mpls.mask.tag))
-					return -EOPNOTSUPP;
-
-				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
-						    inner_first_mpls),
-				       &ib_spec->mpls.val.tag,
-				       sizeof(ib_spec->mpls.val.tag));
-				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
-						    inner_first_mpls),
-				       &ib_spec->mpls.mask.tag,
-				       sizeof(ib_spec->mpls.mask.tag));
-			} else {
-				if (check_mpls_supp_fields(MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-							   ft_field_support.outer_first_mpls),
-							   &ib_spec->mpls.mask.tag))
-					return -EOPNOTSUPP;
-
-				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_v,
-						    outer_first_mpls),
-				       &ib_spec->mpls.val.tag,
-				       sizeof(ib_spec->mpls.val.tag));
-				memcpy(MLX5_ADDR_OF(fte_match_set_misc2, misc_params2_c,
-						    outer_first_mpls),
-				       &ib_spec->mpls.mask.tag,
-				       sizeof(ib_spec->mpls.mask.tag));
-			}
-		}
-		break;
-	case IB_FLOW_SPEC_VXLAN_TUNNEL:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask,
-					 LAST_TUNNEL_FIELD))
-			return -EOPNOTSUPP;
-
-		MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni,
-			 ntohl(ib_spec->tunnel.mask.tunnel_id));
-		MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni,
-			 ntohl(ib_spec->tunnel.val.tunnel_id));
-		break;
-	case IB_FLOW_SPEC_ACTION_TAG:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag,
-					 LAST_FLOW_TAG_FIELD))
-			return -EOPNOTSUPP;
-		if (ib_spec->flow_tag.tag_id >= BIT(24))
-			return -EINVAL;
-
-		flow_context->flow_tag = ib_spec->flow_tag.tag_id;
-		flow_context->flags |= FLOW_CONTEXT_HAS_TAG;
-		break;
-	case IB_FLOW_SPEC_ACTION_DROP:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->drop,
-					 LAST_DROP_FIELD))
-			return -EOPNOTSUPP;
-		action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
-		break;
-	case IB_FLOW_SPEC_ACTION_HANDLE:
-		ret = parse_flow_flow_action(to_mflow_act(ib_spec->action.act),
-			flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS, action);
-		if (ret)
-			return ret;
-		break;
-	case IB_FLOW_SPEC_ACTION_COUNT:
-		if (FIELDS_NOT_SUPPORTED(ib_spec->flow_count,
-					 LAST_COUNTERS_FIELD))
-			return -EOPNOTSUPP;
-
-		/* for now support only one counters spec per flow */
-		if (action->action & MLX5_FLOW_CONTEXT_ACTION_COUNT)
-			return -EINVAL;
-
-		action->counters = ib_spec->flow_count.counters;
-		action->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
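parse_flow_attr(), also on its way out of this file, translates each ib_flow_spec into mlx5 match fields; the VLAN handling in its IB_FLOW_SPEC_ETH case splits the 16-bit TCI into VID, CFI/DEI and priority along the standard 802.1Q layout. A standalone extraction of that split, with example values:

	/* 802.1Q TCI split used by the ETH spec handling above (values are examples). */
	#include <arpa/inet.h>
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint16_t vlan_tag_be = htons(0xA123);	/* PCP=5, DEI=0, VID=0x123 */
		uint16_t tci = ntohs(vlan_tag_be);

		uint16_t first_vid  = tci & 0x0fff;	/* MLX5_SET() keeps only the low 12 bits */
		uint16_t first_cfi  = (tci >> 12) & 0x1;
		uint16_t first_prio = (tci >> 13) & 0x7;

		printf("vid=0x%x cfi=%u prio=%u\n", first_vid, first_cfi, first_prio);
		return 0;
	}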
-/* If a flow could catch both multicast and unicast packets,
- * it won't fall into the multicast flow steering table and this rule
- * could steal other multicast packets.
- */
-static bool flow_is_multicast_only(const struct ib_flow_attr *ib_attr)
-{
-	union ib_flow_spec *flow_spec;
-
-	if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
-	    ib_attr->num_of_specs < 1)
-		return false;
-
-	flow_spec = (union ib_flow_spec *)(ib_attr + 1);
-	if (flow_spec->type == IB_FLOW_SPEC_IPV4) {
-		struct ib_flow_spec_ipv4 *ipv4_spec;
-
-		ipv4_spec = (struct ib_flow_spec_ipv4 *)flow_spec;
-		if (ipv4_is_multicast(ipv4_spec->val.dst_ip))
-			return true;
-
-		return false;
-	}
-
-	if (flow_spec->type == IB_FLOW_SPEC_ETH) {
-		struct ib_flow_spec_eth *eth_spec;
-
-		eth_spec = (struct ib_flow_spec_eth *)flow_spec;
-		return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
-		       is_multicast_ether_addr(eth_spec->val.dst_mac);
-	}
-
-	return false;
-}
-
-enum valid_spec {
-	VALID_SPEC_INVALID,
-	VALID_SPEC_VALID,
-	VALID_SPEC_NA,
-};
-
-static enum valid_spec
-is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev,
-		     const struct mlx5_flow_spec *spec,
-		     const struct mlx5_flow_act *flow_act,
-		     bool egress)
-{
-	const u32 *match_c = spec->match_criteria;
-	bool is_crypto =
-		(flow_act->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT |
-				     MLX5_FLOW_CONTEXT_ACTION_DECRYPT));
-	bool is_ipsec = mlx5_fs_is_ipsec_flow(match_c);
-	bool is_drop = flow_act->action & MLX5_FLOW_CONTEXT_ACTION_DROP;
-
-	/*
-	 * Currently only crypto is supported in egress, when regular egress
-	 * rules would be supported, always return VALID_SPEC_NA.
-	 */
-	if (!is_crypto)
-		return VALID_SPEC_NA;
-
-	return is_crypto && is_ipsec &&
-		(!egress || (!is_drop &&
-			     !(spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG))) ?
-		VALID_SPEC_VALID : VALID_SPEC_INVALID;
-}
-
-static bool is_valid_spec(struct mlx5_core_dev *mdev,
-			  const struct mlx5_flow_spec *spec,
-			  const struct mlx5_flow_act *flow_act,
-			  bool egress)
-{
-	/* We curretly only support ipsec egress flow */
-	return is_valid_esp_aes_gcm(mdev, spec, flow_act, egress) != VALID_SPEC_INVALID;
-}
-
-static bool is_valid_ethertype(struct mlx5_core_dev *mdev,
-			       const struct ib_flow_attr *flow_attr,
-			       bool check_inner)
-{
-	union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
-	int match_ipv = check_inner ?
-			MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-					ft_field_support.inner_ip_version) :
-			MLX5_CAP_FLOWTABLE_NIC_RX(mdev,
-					ft_field_support.outer_ip_version);
-	int inner_bit = check_inner ? IB_FLOW_SPEC_INNER : 0;
-	bool ipv4_spec_valid, ipv6_spec_valid;
-	unsigned int ip_spec_type = 0;
-	bool has_ethertype = false;
-	unsigned int spec_index;
-	bool mask_valid = true;
-	u16 eth_type = 0;
-	bool type_valid;
-
-	/* Validate that ethertype is correct */
-	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
-		if ((ib_spec->type == (IB_FLOW_SPEC_ETH | inner_bit)) &&
-		    ib_spec->eth.mask.ether_type) {
-			mask_valid = (ib_spec->eth.mask.ether_type ==
-				      htons(0xffff));
-			has_ethertype = true;
-			eth_type = ntohs(ib_spec->eth.val.ether_type);
-		} else if ((ib_spec->type == (IB_FLOW_SPEC_IPV4 | inner_bit)) ||
-			   (ib_spec->type == (IB_FLOW_SPEC_IPV6 | inner_bit))) {
-			ip_spec_type = ib_spec->type;
-		}
-		ib_spec = (void *)ib_spec + ib_spec->size;
-	}
-
-	type_valid = (!has_ethertype) || (!ip_spec_type);
-	if (!type_valid && mask_valid) {
-		ipv4_spec_valid = (eth_type == ETH_P_IP) &&
-			(ip_spec_type == (IB_FLOW_SPEC_IPV4 | inner_bit));
-		ipv6_spec_valid = (eth_type == ETH_P_IPV6) &&
-			(ip_spec_type == (IB_FLOW_SPEC_IPV6 | inner_bit));
-
-		type_valid = (ipv4_spec_valid) || (ipv6_spec_valid) ||
-			     (((eth_type == ETH_P_MPLS_UC) ||
-			       (eth_type == ETH_P_MPLS_MC)) && match_ipv);
-	}
-
-	return type_valid;
-}
-
-static bool is_valid_attr(struct mlx5_core_dev *mdev,
-			  const struct ib_flow_attr *flow_attr)
-{
-	return is_valid_ethertype(mdev, flow_attr, false) &&
-	       is_valid_ethertype(mdev, flow_attr, true);
-}
-
-static void put_flow_table(struct mlx5_ib_dev *dev,
-			   struct mlx5_ib_flow_prio *prio, bool ft_added)
-{
-	prio->refcount -= !!ft_added;
-	if (!prio->refcount) {
-		mlx5_destroy_flow_table(prio->flow_table);
-		prio->flow_table = NULL;
-	}
-}
-
-static void counters_clear_description(struct ib_counters *counters)
-{
-	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
-
-	mutex_lock(&mcounters->mcntrs_mutex);
-	kfree(mcounters->counters_data);
-	mcounters->counters_data = NULL;
-	mcounters->cntrs_max_index = 0;
-	mutex_unlock(&mcounters->mcntrs_mutex);
-}
-
-static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
-{
-	struct mlx5_ib_flow_handler *handler = container_of(flow_id,
-							  struct mlx5_ib_flow_handler,
-							  ibflow);
-	struct mlx5_ib_flow_handler *iter, *tmp;
-	struct mlx5_ib_dev *dev = handler->dev;
-
-	mutex_lock(&dev->flow_db->lock);
-
-	list_for_each_entry_safe(iter, tmp, &handler->list, list) {
-		mlx5_del_flow_rules(iter->rule);
-		put_flow_table(dev, iter->prio, true);
-		list_del(&iter->list);
-		kfree(iter);
-	}
-
-	mlx5_del_flow_rules(handler->rule);
-	put_flow_table(dev, handler->prio, true);
-	if (handler->ibcounters &&
-	    atomic_read(&handler->ibcounters->usecnt) == 1)
-		counters_clear_description(handler->ibcounters);
-
-	mutex_unlock(&dev->flow_db->lock);
-	if (handler->flow_matcher)
-		atomic_dec(&handler->flow_matcher->usecnt);
-	kfree(handler);
-
-	return 0;
-}
-
-static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
-{
-	priority *= 2;
-	if (!dont_trap)
-		priority++;
-	return priority;
-}
-
-enum flow_table_type {
-	MLX5_IB_FT_RX,
-	MLX5_IB_FT_TX
-};
-
-#define MLX5_FS_MAX_TYPES	 6
-#define MLX5_FS_MAX_ENTRIES	 BIT(16)
-
-static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns,
-					   struct mlx5_ib_flow_prio *prio,
-					   int priority,
-					   int num_entries, int num_groups,
-					   u32 flags)
-{
-	struct mlx5_flow_table *ft;
-
-	ft = mlx5_create_auto_grouped_flow_table(ns, priority,
-						 num_entries,
-						 num_groups,
-						 0, flags);
-	if (IS_ERR(ft))
-		return ERR_CAST(ft);
-
-	prio->flow_table = ft;
-	prio->refcount = 0;
-	return prio;
-}
-
-static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
-						struct ib_flow_attr *flow_attr,
-						enum flow_table_type ft_type)
-{
-	bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
-	struct mlx5_flow_namespace *ns = NULL;
-	struct mlx5_ib_flow_prio *prio;
-	struct mlx5_flow_table *ft;
-	int max_table_size;
-	int num_entries;
-	int num_groups;
-	bool esw_encap;
-	u32 flags = 0;
-	int priority;
-
-	max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
-						       log_max_ft_size));
-	esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
-		DEVLINK_ESWITCH_ENCAP_MODE_NONE;
-	if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
-		enum mlx5_flow_namespace_type fn_type;
-
-		if (flow_is_multicast_only(flow_attr) &&
-		    !dont_trap)
-			priority = MLX5_IB_FLOW_MCAST_PRIO;
-		else
-			priority = ib_prio_to_core_prio(flow_attr->priority,
-							dont_trap);
-		if (ft_type == MLX5_IB_FT_RX) {
-			fn_type = MLX5_FLOW_NAMESPACE_BYPASS;
-			prio = &dev->flow_db->prios[priority];
-			if (!dev->is_rep && !esw_encap &&
-			    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap))
-				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
-			if (!dev->is_rep && !esw_encap &&
-			    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
-					reformat_l3_tunnel_to_l2))
-				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
-		} else {
-			max_table_size =
-				BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev,
-							      log_max_ft_size));
-			fn_type = MLX5_FLOW_NAMESPACE_EGRESS;
-			prio = &dev->flow_db->egress_prios[priority];
-			if (!dev->is_rep && !esw_encap &&
-			    MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat))
-				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
-		}
-		ns = mlx5_get_flow_namespace(dev->mdev, fn_type);
-		num_entries = MLX5_FS_MAX_ENTRIES;
-		num_groups = MLX5_FS_MAX_TYPES;
-	} else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
-		   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
-		ns = mlx5_get_flow_namespace(dev->mdev,
-					     MLX5_FLOW_NAMESPACE_LEFTOVERS);
-		build_leftovers_ft_param(&priority,
-					 &num_entries,
-					 &num_groups);
-		prio = &dev->flow_db->prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
-	} else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
-		if (!MLX5_CAP_FLOWTABLE(dev->mdev,
-					allow_sniffer_and_nic_rx_shared_tir))
-			return ERR_PTR(-ENOTSUPP);
-
-		ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ?
-					     MLX5_FLOW_NAMESPACE_SNIFFER_RX :
-					     MLX5_FLOW_NAMESPACE_SNIFFER_TX);
-
-		prio = &dev->flow_db->sniffer[ft_type];
-		priority = 0;
-		num_entries = 1;
-		num_groups = 1;
-	}
-
-	if (!ns)
-		return ERR_PTR(-ENOTSUPP);
-
-	max_table_size = min_t(int, num_entries, max_table_size);
-
-	ft = prio->flow_table;
-	if (!ft)
-		return _get_prio(ns, prio, priority, max_table_size, num_groups,
-				 flags);
-
-	return prio;
-}
-
-static void set_underlay_qp(struct mlx5_ib_dev *dev,
-			    struct mlx5_flow_spec *spec,
-			    u32 underlay_qpn)
-{
-	void *misc_params_c = MLX5_ADDR_OF(fte_match_param,
-					   spec->match_criteria,
-					   misc_parameters);
-	void *misc_params_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
-					   misc_parameters);
-
-	if (underlay_qpn &&
-	    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
-				      ft_field_support.bth_dst_qp)) {
-		MLX5_SET(fte_match_set_misc,
-			 misc_params_v, bth_dst_qp, underlay_qpn);
-		MLX5_SET(fte_match_set_misc,
-			 misc_params_c, bth_dst_qp, 0xffffff);
-	}
-}
-
-static int read_flow_counters(struct ib_device *ibdev,
-			      struct mlx5_read_counters_attr *read_attr)
-{
-	struct mlx5_fc *fc = read_attr->hw_cntrs_hndl;
-	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-
-	return mlx5_fc_query(dev->mdev, fc,
-			     &read_attr->out[IB_COUNTER_PACKETS],
-			     &read_attr->out[IB_COUNTER_BYTES]);
-}
-
-/* flow counters currently expose two counters packets and bytes */
-#define FLOW_COUNTERS_NUM 2
-static int counters_set_description(struct ib_counters *counters,
-				    enum mlx5_ib_counters_type counters_type,
-				    struct mlx5_ib_flow_counters_desc *desc_data,
-				    u32 ncounters)
-{
-	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
-	u32 cntrs_max_index = 0;
-	int i;
-
-	if (counters_type != MLX5_IB_COUNTERS_FLOW)
-		return -EINVAL;
-
-	/* init the fields for the object */
-	mcounters->type = counters_type;
-	mcounters->read_counters = read_flow_counters;
-	mcounters->counters_num = FLOW_COUNTERS_NUM;
-	mcounters->ncounters = ncounters;
-	/* each counter entry have both description and index pair */
-	for (i = 0; i < ncounters; i++) {
-		if (desc_data[i].description > IB_COUNTER_BYTES)
-			return -EINVAL;
-
-		if (cntrs_max_index <= desc_data[i].index)
-			cntrs_max_index = desc_data[i].index + 1;
-	}
-
-	mutex_lock(&mcounters->mcntrs_mutex);
-	mcounters->counters_data = desc_data;
-	mcounters->cntrs_max_index = cntrs_max_index;
-	mutex_unlock(&mcounters->mcntrs_mutex);
-
-	return 0;
-}
-
-#define MAX_COUNTERS_NUM (USHRT_MAX / (sizeof(u32) * 2))
-static int flow_counters_set_data(struct ib_counters *ibcounters,
-				  struct mlx5_ib_create_flow *ucmd)
-{
-	struct mlx5_ib_mcounters *mcounters = to_mcounters(ibcounters);
-	struct mlx5_ib_flow_counters_data *cntrs_data = NULL;
-	struct mlx5_ib_flow_counters_desc *desc_data = NULL;
-	bool hw_hndl = false;
-	int ret = 0;
-
-	if (ucmd && ucmd->ncounters_data != 0) {
-		cntrs_data = ucmd->data;
-		if (cntrs_data->ncounters > MAX_COUNTERS_NUM)
-			return -EINVAL;
-
-		desc_data = kcalloc(cntrs_data->ncounters,
-				    sizeof(*desc_data),
-				    GFP_KERNEL);
-		if (!desc_data)
-			return  -ENOMEM;
-
-		if (copy_from_user(desc_data,
-				   u64_to_user_ptr(cntrs_data->counters_data),
-				   sizeof(*desc_data) * cntrs_data->ncounters)) {
-			ret = -EFAULT;
-			goto free;
-		}
-	}
-
-	if (!mcounters->hw_cntrs_hndl) {
-		mcounters->hw_cntrs_hndl = mlx5_fc_create(
-			to_mdev(ibcounters->device)->mdev, false);
-		if (IS_ERR(mcounters->hw_cntrs_hndl)) {
-			ret = PTR_ERR(mcounters->hw_cntrs_hndl);
-			goto free;
-		}
-		hw_hndl = true;
-	}
-
-	if (desc_data) {
-		/* counters already bound to at least one flow */
-		if (mcounters->cntrs_max_index) {
-			ret = -EINVAL;
-			goto free_hndl;
-		}
-
-		ret = counters_set_description(ibcounters,
-					       MLX5_IB_COUNTERS_FLOW,
-					       desc_data,
-					       cntrs_data->ncounters);
-		if (ret)
-			goto free_hndl;
-
-	} else if (!mcounters->cntrs_max_index) {
-		/* counters not bound yet, must have udata passed */
-		ret = -EINVAL;
-		goto free_hndl;
-	}
-
-	return 0;
-
-free_hndl:
-	if (hw_hndl) {
-		mlx5_fc_destroy(to_mdev(ibcounters->device)->mdev,
-				mcounters->hw_cntrs_hndl);
-		mcounters->hw_cntrs_hndl = NULL;
-	}
-free:
-	kfree(desc_data);
-	return ret;
-}
-
-static void mlx5_ib_set_rule_source_port(struct mlx5_ib_dev *dev,
-					 struct mlx5_flow_spec *spec,
-					 struct mlx5_eswitch_rep *rep)
-{
-	struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
-	void *misc;
-
-	if (mlx5_eswitch_vport_match_metadata_enabled(esw)) {
-		misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
-				    misc_parameters_2);
-
-		MLX5_SET(fte_match_set_misc2, misc, metadata_reg_c_0,
-			 mlx5_eswitch_get_vport_metadata_for_match(esw,
-								   rep->vport));
-		misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
-				    misc_parameters_2);
-
-		MLX5_SET_TO_ONES(fte_match_set_misc2, misc, metadata_reg_c_0);
-	} else {
-		misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
-				    misc_parameters);
-
-		MLX5_SET(fte_match_set_misc, misc, source_port, rep->vport);
-
-		misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
-				    misc_parameters);
-
-		MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
-	}
-}
-
-static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
-						      struct mlx5_ib_flow_prio *ft_prio,
-						      const struct ib_flow_attr *flow_attr,
-						      struct mlx5_flow_destination *dst,
-						      u32 underlay_qpn,
-						      struct mlx5_ib_create_flow *ucmd)
-{
-	struct mlx5_flow_table	*ft = ft_prio->flow_table;
-	struct mlx5_ib_flow_handler *handler;
-	struct mlx5_flow_act flow_act = {};
-	struct mlx5_flow_spec *spec;
-	struct mlx5_flow_destination dest_arr[2] = {};
-	struct mlx5_flow_destination *rule_dst = dest_arr;
-	const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
-	unsigned int spec_index;
-	u32 prev_type = 0;
-	int err = 0;
-	int dest_num = 0;
-	bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
-
-	if (!is_valid_attr(dev->mdev, flow_attr))
-		return ERR_PTR(-EINVAL);
-
-	if (dev->is_rep && is_egress)
-		return ERR_PTR(-EINVAL);
-
-	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
-	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
-	if (!handler || !spec) {
-		err = -ENOMEM;
-		goto free;
-	}
-
-	INIT_LIST_HEAD(&handler->list);
-
-	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
-		err = parse_flow_attr(dev->mdev, spec,
-				      ib_flow, flow_attr, &flow_act,
-				      prev_type);
-		if (err < 0)
-			goto free;
-
-		prev_type = ((union ib_flow_spec *)ib_flow)->type;
-		ib_flow += ((union ib_flow_spec *)ib_flow)->size;
-	}
-
-	if (dst && !(flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP)) {
-		memcpy(&dest_arr[0], dst, sizeof(*dst));
-		dest_num++;
-	}
-
-	if (!flow_is_multicast_only(flow_attr))
-		set_underlay_qp(dev, spec, underlay_qpn);
-
-	if (dev->is_rep) {
-		struct mlx5_eswitch_rep *rep;
-
-		rep = dev->port[flow_attr->port - 1].rep;
-		if (!rep) {
-			err = -EINVAL;
-			goto free;
-		}
-
-		mlx5_ib_set_rule_source_port(dev, spec, rep);
-	}
-
-	spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
-
-	if (is_egress &&
-	    !is_valid_spec(dev->mdev, spec, &flow_act, is_egress)) {
-		err = -EINVAL;
-		goto free;
-	}
-
-	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
-		struct mlx5_ib_mcounters *mcounters;
-
-		err = flow_counters_set_data(flow_act.counters, ucmd);
-		if (err)
-			goto free;
-
-		mcounters = to_mcounters(flow_act.counters);
-		handler->ibcounters = flow_act.counters;
-		dest_arr[dest_num].type =
-			MLX5_FLOW_DESTINATION_TYPE_COUNTER;
-		dest_arr[dest_num].counter_id =
-			mlx5_fc_id(mcounters->hw_cntrs_hndl);
-		dest_num++;
-	}
-
-	if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DROP) {
-		if (!dest_num)
-			rule_dst = NULL;
-	} else {
-		if (is_egress)
-			flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
-		else
-			flow_act.action |=
-				dest_num ?  MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
-					MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
-	}
-
-	if ((spec->flow_context.flags & FLOW_CONTEXT_HAS_TAG)  &&
-	    (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
-	     flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
-		mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n",
-			     spec->flow_context.flow_tag, flow_attr->type);
-		err = -EINVAL;
-		goto free;
-	}
-	handler->rule = mlx5_add_flow_rules(ft, spec,
-					    &flow_act,
-					    rule_dst, dest_num);
-
-	if (IS_ERR(handler->rule)) {
-		err = PTR_ERR(handler->rule);
-		goto free;
-	}
-
-	ft_prio->refcount++;
-	handler->prio = ft_prio;
-	handler->dev = dev;
-
-	ft_prio->flow_table = ft;
-free:
-	if (err && handler) {
-		if (handler->ibcounters &&
-		    atomic_read(&handler->ibcounters->usecnt) == 1)
-			counters_clear_description(handler->ibcounters);
-		kfree(handler);
-	}
-	kvfree(spec);
-	return err ? ERR_PTR(err) : handler;
-}
-
-static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
-						     struct mlx5_ib_flow_prio *ft_prio,
-						     const struct ib_flow_attr *flow_attr,
-						     struct mlx5_flow_destination *dst)
-{
-	return _create_flow_rule(dev, ft_prio, flow_attr, dst, 0, NULL);
-}
-
-static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev,
-							  struct mlx5_ib_flow_prio *ft_prio,
-							  struct ib_flow_attr *flow_attr,
-							  struct mlx5_flow_destination *dst)
-{
-	struct mlx5_ib_flow_handler *handler_dst = NULL;
-	struct mlx5_ib_flow_handler *handler = NULL;
-
-	handler = create_flow_rule(dev, ft_prio, flow_attr, NULL);
-	if (!IS_ERR(handler)) {
-		handler_dst = create_flow_rule(dev, ft_prio,
-					       flow_attr, dst);
-		if (IS_ERR(handler_dst)) {
-			mlx5_del_flow_rules(handler->rule);
-			ft_prio->refcount--;
-			kfree(handler);
-			handler = handler_dst;
-		} else {
-			list_add(&handler_dst->list, &handler->list);
-		}
-	}
-
-	return handler;
-}
-enum {
-	LEFTOVERS_MC,
-	LEFTOVERS_UC,
-};
-
-static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
-							  struct mlx5_ib_flow_prio *ft_prio,
-							  struct ib_flow_attr *flow_attr,
-							  struct mlx5_flow_destination *dst)
-{
-	struct mlx5_ib_flow_handler *handler_ucast = NULL;
-	struct mlx5_ib_flow_handler *handler = NULL;
-
-	static struct {
-		struct ib_flow_attr	flow_attr;
-		struct ib_flow_spec_eth eth_flow;
-	} leftovers_specs[] = {
-		[LEFTOVERS_MC] = {
-			.flow_attr = {
-				.num_of_specs = 1,
-				.size = sizeof(leftovers_specs[0])
-			},
-			.eth_flow = {
-				.type = IB_FLOW_SPEC_ETH,
-				.size = sizeof(struct ib_flow_spec_eth),
-				.mask = {.dst_mac = {0x1} },
-				.val =  {.dst_mac = {0x1} }
-			}
-		},
-		[LEFTOVERS_UC] = {
-			.flow_attr = {
-				.num_of_specs = 1,
-				.size = sizeof(leftovers_specs[0])
-			},
-			.eth_flow = {
-				.type = IB_FLOW_SPEC_ETH,
-				.size = sizeof(struct ib_flow_spec_eth),
-				.mask = {.dst_mac = {0x1} },
-				.val = {.dst_mac = {} }
-			}
-		}
-	};
-
-	handler = create_flow_rule(dev, ft_prio,
-				   &leftovers_specs[LEFTOVERS_MC].flow_attr,
-				   dst);
-	if (!IS_ERR(handler) &&
-	    flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
-		handler_ucast = create_flow_rule(dev, ft_prio,
-						 &leftovers_specs[LEFTOVERS_UC].flow_attr,
-						 dst);
-		if (IS_ERR(handler_ucast)) {
-			mlx5_del_flow_rules(handler->rule);
-			ft_prio->refcount--;
-			kfree(handler);
-			handler = handler_ucast;
-		} else {
-			list_add(&handler_ucast->list, &handler->list);
-		}
-	}
-
-	return handler;
-}
-
-static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
-							struct mlx5_ib_flow_prio *ft_rx,
-							struct mlx5_ib_flow_prio *ft_tx,
-							struct mlx5_flow_destination *dst)
-{
-	struct mlx5_ib_flow_handler *handler_rx;
-	struct mlx5_ib_flow_handler *handler_tx;
-	int err;
-	static const struct ib_flow_attr flow_attr  = {
-		.num_of_specs = 0,
-		.size = sizeof(flow_attr)
-	};
-
-	handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst);
-	if (IS_ERR(handler_rx)) {
-		err = PTR_ERR(handler_rx);
-		goto err;
-	}
-
-	handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst);
-	if (IS_ERR(handler_tx)) {
-		err = PTR_ERR(handler_tx);
-		goto err_tx;
-	}
-
-	list_add(&handler_tx->list, &handler_rx->list);
-
-	return handler_rx;
-
-err_tx:
-	mlx5_del_flow_rules(handler_rx->rule);
-	ft_rx->refcount--;
-	kfree(handler_rx);
-err:
-	return ERR_PTR(err);
-}
-
-static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
-					   struct ib_flow_attr *flow_attr,
-					   int domain,
-					   struct ib_udata *udata)
-{
-	struct mlx5_ib_dev *dev = to_mdev(qp->device);
-	struct mlx5_ib_qp *mqp = to_mqp(qp);
-	struct mlx5_ib_flow_handler *handler = NULL;
-	struct mlx5_flow_destination *dst = NULL;
-	struct mlx5_ib_flow_prio *ft_prio_tx = NULL;
-	struct mlx5_ib_flow_prio *ft_prio;
-	bool is_egress = flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS;
-	struct mlx5_ib_create_flow *ucmd = NULL, ucmd_hdr;
-	size_t min_ucmd_sz, required_ucmd_sz;
-	int err;
-	int underlay_qpn;
-
-	if (udata && udata->inlen) {
-		min_ucmd_sz = offsetof(typeof(ucmd_hdr), reserved) +
-				sizeof(ucmd_hdr.reserved);
-		if (udata->inlen < min_ucmd_sz)
-			return ERR_PTR(-EOPNOTSUPP);
-
-		err = ib_copy_from_udata(&ucmd_hdr, udata, min_ucmd_sz);
-		if (err)
-			return ERR_PTR(err);
-
-		/* currently supports only one counters data */
-		if (ucmd_hdr.ncounters_data > 1)
-			return ERR_PTR(-EINVAL);
-
-		required_ucmd_sz = min_ucmd_sz +
-			sizeof(struct mlx5_ib_flow_counters_data) *
-			ucmd_hdr.ncounters_data;
-		if (udata->inlen > required_ucmd_sz &&
-		    !ib_is_udata_cleared(udata, required_ucmd_sz,
-					 udata->inlen - required_ucmd_sz))
-			return ERR_PTR(-EOPNOTSUPP);
-
-		ucmd = kzalloc(required_ucmd_sz, GFP_KERNEL);
-		if (!ucmd)
-			return ERR_PTR(-ENOMEM);
-
-		err = ib_copy_from_udata(ucmd, udata, required_ucmd_sz);
-		if (err)
-			goto free_ucmd;
-	}
-
-	if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) {
-		err = -ENOMEM;
-		goto free_ucmd;
-	}
-
-	if (domain != IB_FLOW_DOMAIN_USER ||
-	    flow_attr->port > dev->num_ports ||
-	    (flow_attr->flags & ~(IB_FLOW_ATTR_FLAGS_DONT_TRAP |
-				  IB_FLOW_ATTR_FLAGS_EGRESS))) {
-		err = -EINVAL;
-		goto free_ucmd;
-	}
-
-	if (is_egress &&
-	    (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
-	     flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) {
-		err = -EINVAL;
-		goto free_ucmd;
-	}
-
-	dst = kzalloc(sizeof(*dst), GFP_KERNEL);
-	if (!dst) {
-		err = -ENOMEM;
-		goto free_ucmd;
-	}
-
-	mutex_lock(&dev->flow_db->lock);
-
-	ft_prio = get_flow_table(dev, flow_attr,
-				 is_egress ? MLX5_IB_FT_TX : MLX5_IB_FT_RX);
-	if (IS_ERR(ft_prio)) {
-		err = PTR_ERR(ft_prio);
-		goto unlock;
-	}
-	if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
-		ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX);
-		if (IS_ERR(ft_prio_tx)) {
-			err = PTR_ERR(ft_prio_tx);
-			ft_prio_tx = NULL;
-			goto destroy_ft;
-		}
-	}
-
-	if (is_egress) {
-		dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT;
-	} else {
-		dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
-		if (mqp->flags & MLX5_IB_QP_RSS)
-			dst->tir_num = mqp->rss_qp.tirn;
-		else
-			dst->tir_num = mqp->raw_packet_qp.rq.tirn;
-	}
-
-	if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
-		if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)  {
-			handler = create_dont_trap_rule(dev, ft_prio,
-							flow_attr, dst);
-		} else {
-			underlay_qpn = (mqp->flags & MLX5_IB_QP_UNDERLAY) ?
-					mqp->underlay_qpn : 0;
-			handler = _create_flow_rule(dev, ft_prio, flow_attr,
-						    dst, underlay_qpn, ucmd);
-		}
-	} else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
-		   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
-		handler = create_leftovers_rule(dev, ft_prio, flow_attr,
-						dst);
-	} else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
-		handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst);
-	} else {
-		err = -EINVAL;
-		goto destroy_ft;
-	}
-
-	if (IS_ERR(handler)) {
-		err = PTR_ERR(handler);
-		handler = NULL;
-		goto destroy_ft;
-	}
-
-	mutex_unlock(&dev->flow_db->lock);
-	kfree(dst);
-	kfree(ucmd);
-
-	return &handler->ibflow;
-
-destroy_ft:
-	put_flow_table(dev, ft_prio, false);
-	if (ft_prio_tx)
-		put_flow_table(dev, ft_prio_tx, false);
-unlock:
-	mutex_unlock(&dev->flow_db->lock);
-	kfree(dst);
-free_ucmd:
-	kfree(ucmd);
-	return ERR_PTR(err);
-}
-
-static struct mlx5_ib_flow_prio *
-_get_flow_table(struct mlx5_ib_dev *dev,
-		struct mlx5_ib_flow_matcher *fs_matcher,
-		bool mcast)
-{
-	struct mlx5_flow_namespace *ns = NULL;
-	struct mlx5_ib_flow_prio *prio = NULL;
-	int max_table_size = 0;
-	bool esw_encap;
-	u32 flags = 0;
-	int priority;
-
-	if (mcast)
-		priority = MLX5_IB_FLOW_MCAST_PRIO;
-	else
-		priority = ib_prio_to_core_prio(fs_matcher->priority, false);
-
-	esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) !=
-		DEVLINK_ESWITCH_ENCAP_MODE_NONE;
-	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) {
-		max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
-					log_max_ft_size));
-		if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap) && !esw_encap)
-			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
-		if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
-					      reformat_l3_tunnel_to_l2) &&
-		    !esw_encap)
-			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
-	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) {
-		max_table_size = BIT(
-			MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, log_max_ft_size));
-		if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat) && !esw_encap)
-			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
-	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB) {
-		max_table_size = BIT(
-			MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, log_max_ft_size));
-		if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, decap) && esw_encap)
-			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
-		if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, reformat_l3_tunnel_to_l2) &&
-		    esw_encap)
-			flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
-		priority = FDB_BYPASS_PATH;
-	} else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) {
-		max_table_size =
-			BIT(MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev,
-						       log_max_ft_size));
-		priority = fs_matcher->priority;
-	}
-
-	max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
-
-	ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type);
-	if (!ns)
-		return ERR_PTR(-ENOTSUPP);
-
-	if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS)
-		prio = &dev->flow_db->prios[priority];
-	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS)
-		prio = &dev->flow_db->egress_prios[priority];
-	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB)
-		prio = &dev->flow_db->fdb;
-	else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX)
-		prio = &dev->flow_db->rdma_rx[priority];
-
-	if (!prio)
-		return ERR_PTR(-EINVAL);
-
-	if (prio->flow_table)
-		return prio;
-
-	return _get_prio(ns, prio, priority, max_table_size,
-			 MLX5_FS_MAX_TYPES, flags);
-}
-
-static struct mlx5_ib_flow_handler *
-_create_raw_flow_rule(struct mlx5_ib_dev *dev,
-		      struct mlx5_ib_flow_prio *ft_prio,
-		      struct mlx5_flow_destination *dst,
-		      struct mlx5_ib_flow_matcher  *fs_matcher,
-		      struct mlx5_flow_context *flow_context,
-		      struct mlx5_flow_act *flow_act,
-		      void *cmd_in, int inlen,
-		      int dst_num)
-{
-	struct mlx5_ib_flow_handler *handler;
-	struct mlx5_flow_spec *spec;
-	struct mlx5_flow_table *ft = ft_prio->flow_table;
-	int err = 0;
-
-	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
-	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
-	if (!handler || !spec) {
-		err = -ENOMEM;
-		goto free;
-	}
-
-	INIT_LIST_HEAD(&handler->list);
-
-	memcpy(spec->match_value, cmd_in, inlen);
-	memcpy(spec->match_criteria, fs_matcher->matcher_mask.match_params,
-	       fs_matcher->mask_len);
-	spec->match_criteria_enable = fs_matcher->match_criteria_enable;
-	spec->flow_context = *flow_context;
-
-	handler->rule = mlx5_add_flow_rules(ft, spec,
-					    flow_act, dst, dst_num);
-
-	if (IS_ERR(handler->rule)) {
-		err = PTR_ERR(handler->rule);
-		goto free;
-	}
-
-	ft_prio->refcount++;
-	handler->prio = ft_prio;
-	handler->dev = dev;
-	ft_prio->flow_table = ft;
-
-free:
-	if (err)
-		kfree(handler);
-	kvfree(spec);
-	return err ? ERR_PTR(err) : handler;
-}
-
-static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher,
-				void *match_v)
-{
-	void *match_c;
-	void *match_v_set_lyr_2_4, *match_c_set_lyr_2_4;
-	void *dmac, *dmac_mask;
-	void *ipv4, *ipv4_mask;
-
-	if (!(fs_matcher->match_criteria_enable &
-	      (1 << MATCH_CRITERIA_ENABLE_OUTER_BIT)))
-		return false;
-
-	match_c = fs_matcher->matcher_mask.match_params;
-	match_v_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_v,
-					   outer_headers);
-	match_c_set_lyr_2_4 = MLX5_ADDR_OF(fte_match_param, match_c,
-					   outer_headers);
-
-	dmac = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
-			    dmac_47_16);
-	dmac_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
-				 dmac_47_16);
-
-	if (is_multicast_ether_addr(dmac) &&
-	    is_multicast_ether_addr(dmac_mask))
-		return true;
-
-	ipv4 = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_v_set_lyr_2_4,
-			    dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
-
-	ipv4_mask = MLX5_ADDR_OF(fte_match_set_lyr_2_4, match_c_set_lyr_2_4,
-				 dst_ipv4_dst_ipv6.ipv4_layout.ipv4);
-
-	if (ipv4_is_multicast(*(__be32 *)(ipv4)) &&
-	    ipv4_is_multicast(*(__be32 *)(ipv4_mask)))
-		return true;
-
-	return false;
-}
-
-struct mlx5_ib_flow_handler *
-mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev,
-			struct mlx5_ib_flow_matcher *fs_matcher,
-			struct mlx5_flow_context *flow_context,
-			struct mlx5_flow_act *flow_act,
-			u32 counter_id,
-			void *cmd_in, int inlen, int dest_id,
-			int dest_type)
-{
-	struct mlx5_flow_destination *dst;
-	struct mlx5_ib_flow_prio *ft_prio;
-	struct mlx5_ib_flow_handler *handler;
-	int dst_num = 0;
-	bool mcast;
-	int err;
-
-	if (fs_matcher->flow_type != MLX5_IB_FLOW_TYPE_NORMAL)
-		return ERR_PTR(-EOPNOTSUPP);
-
-	if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO)
-		return ERR_PTR(-ENOMEM);
-
-	dst = kcalloc(2, sizeof(*dst), GFP_KERNEL);
-	if (!dst)
-		return ERR_PTR(-ENOMEM);
-
-	mcast = raw_fs_is_multicast(fs_matcher, cmd_in);
-	mutex_lock(&dev->flow_db->lock);
-
-	ft_prio = _get_flow_table(dev, fs_matcher, mcast);
-	if (IS_ERR(ft_prio)) {
-		err = PTR_ERR(ft_prio);
-		goto unlock;
-	}
-
-	if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) {
-		dst[dst_num].type = dest_type;
-		dst[dst_num].tir_num = dest_id;
-		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
-	} else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) {
-		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM;
-		dst[dst_num].ft_num = dest_id;
-		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
-	} else {
-		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_PORT;
-		flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW;
-	}
-
-	dst_num++;
-
-	if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
-		dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
-		dst[dst_num].counter_id = counter_id;
-		dst_num++;
-	}
-
-	handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher,
-					flow_context, flow_act,
-					cmd_in, inlen, dst_num);
-
-	if (IS_ERR(handler)) {
-		err = PTR_ERR(handler);
-		goto destroy_ft;
-	}
-
-	mutex_unlock(&dev->flow_db->lock);
-	atomic_inc(&fs_matcher->usecnt);
-	handler->flow_matcher = fs_matcher;
-
-	kfree(dst);
-
-	return handler;
-
-destroy_ft:
-	put_flow_table(dev, ft_prio, false);
-unlock:
-	mutex_unlock(&dev->flow_db->lock);
-	kfree(dst);
-
-	return ERR_PTR(err);
-}
-
-static u32 mlx5_ib_flow_action_flags_to_accel_xfrm_flags(u32 mlx5_flags)
-{
-	u32 flags = 0;
-
-	if (mlx5_flags & MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA)
-		flags |= MLX5_ACCEL_XFRM_FLAG_REQUIRE_METADATA;
-
-	return flags;
-}
-
-#define MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED	MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA
-static struct ib_flow_action *
-mlx5_ib_create_flow_action_esp(struct ib_device *device,
-			       const struct ib_flow_action_attrs_esp *attr,
-			       struct uverbs_attr_bundle *attrs)
-{
-	struct mlx5_ib_dev *mdev = to_mdev(device);
-	struct ib_uverbs_flow_action_esp_keymat_aes_gcm *aes_gcm;
-	struct mlx5_accel_esp_xfrm_attrs accel_attrs = {};
-	struct mlx5_ib_flow_action *action;
-	u64 action_flags;
-	u64 flags;
-	int err = 0;
-
-	err = uverbs_get_flags64(
-		&action_flags, attrs, MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
-		((MLX5_FLOW_ACTION_ESP_CREATE_LAST_SUPPORTED << 1) - 1));
-	if (err)
-		return ERR_PTR(err);
-
-	flags = mlx5_ib_flow_action_flags_to_accel_xfrm_flags(action_flags);
-
-	/* We current only support a subset of the standard features. Only a
-	 * keymat of type AES_GCM, with icv_len == 16, iv_algo == SEQ and esn
-	 * (with overlap). Full offload mode isn't supported.
-	 */
-	if (!attr->keymat || attr->replay || attr->encap ||
-	    attr->spi || attr->seq || attr->tfc_pad ||
-	    attr->hard_limit_pkts ||
-	    (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
-			     IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)))
-		return ERR_PTR(-EOPNOTSUPP);
-
-	if (attr->keymat->protocol !=
-	    IB_UVERBS_FLOW_ACTION_ESP_KEYMAT_AES_GCM)
-		return ERR_PTR(-EOPNOTSUPP);
-
-	aes_gcm = &attr->keymat->keymat.aes_gcm;
-
-	if (aes_gcm->icv_len != 16 ||
-	    aes_gcm->iv_algo != IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ)
-		return ERR_PTR(-EOPNOTSUPP);
-
-	action = kmalloc(sizeof(*action), GFP_KERNEL);
-	if (!action)
-		return ERR_PTR(-ENOMEM);
-
-	action->esp_aes_gcm.ib_flags = attr->flags;
-	memcpy(&accel_attrs.keymat.aes_gcm.aes_key, &aes_gcm->aes_key,
-	       sizeof(accel_attrs.keymat.aes_gcm.aes_key));
-	accel_attrs.keymat.aes_gcm.key_len = aes_gcm->key_len * 8;
-	memcpy(&accel_attrs.keymat.aes_gcm.salt, &aes_gcm->salt,
-	       sizeof(accel_attrs.keymat.aes_gcm.salt));
-	memcpy(&accel_attrs.keymat.aes_gcm.seq_iv, &aes_gcm->iv,
-	       sizeof(accel_attrs.keymat.aes_gcm.seq_iv));
-	accel_attrs.keymat.aes_gcm.icv_len = aes_gcm->icv_len * 8;
-	accel_attrs.keymat.aes_gcm.iv_algo = MLX5_ACCEL_ESP_AES_GCM_IV_ALGO_SEQ;
-	accel_attrs.keymat_type = MLX5_ACCEL_ESP_KEYMAT_AES_GCM;
-
-	accel_attrs.esn = attr->esn;
-	if (attr->flags & IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED)
-		accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_TRIGGERED;
-	if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
-		accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
-
-	if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT)
-		accel_attrs.action |= MLX5_ACCEL_ESP_ACTION_ENCRYPT;
-
-	action->esp_aes_gcm.ctx =
-		mlx5_accel_esp_create_xfrm(mdev->mdev, &accel_attrs, flags);
-	if (IS_ERR(action->esp_aes_gcm.ctx)) {
-		err = PTR_ERR(action->esp_aes_gcm.ctx);
-		goto err_parse;
-	}
-
-	action->esp_aes_gcm.ib_flags = attr->flags;
-
-	return &action->ib_action;
-
-err_parse:
-	kfree(action);
-	return ERR_PTR(err);
-}
-
-static int
-mlx5_ib_modify_flow_action_esp(struct ib_flow_action *action,
-			       const struct ib_flow_action_attrs_esp *attr,
-			       struct uverbs_attr_bundle *attrs)
-{
-	struct mlx5_ib_flow_action *maction = to_mflow_act(action);
-	struct mlx5_accel_esp_xfrm_attrs accel_attrs;
-	int err = 0;
-
-	if (attr->keymat || attr->replay || attr->encap ||
-	    attr->spi || attr->seq || attr->tfc_pad ||
-	    attr->hard_limit_pkts ||
-	    (attr->flags & ~(IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
-			     IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS |
-			     IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)))
-		return -EOPNOTSUPP;
-
-	/* Only the ESN value or the MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP can
-	 * be modified.
-	 */
-	if (!(maction->esp_aes_gcm.ib_flags &
-	      IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED) &&
-	    attr->flags & (IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED |
-			   IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW))
-		return -EINVAL;
-
-	memcpy(&accel_attrs, &maction->esp_aes_gcm.ctx->attrs,
-	       sizeof(accel_attrs));
-
-	accel_attrs.esn = attr->esn;
-	if (attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW)
-		accel_attrs.flags |= MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
-	else
-		accel_attrs.flags &= ~MLX5_ACCEL_ESP_FLAGS_ESN_STATE_OVERLAP;
-
-	err = mlx5_accel_esp_modify_xfrm(maction->esp_aes_gcm.ctx,
-					 &accel_attrs);
-	if (err)
-		return err;
-
-	maction->esp_aes_gcm.ib_flags &=
-		~IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
-	maction->esp_aes_gcm.ib_flags |=
-		attr->flags & IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW;
-
-	return 0;
-}
-
-static int mlx5_ib_destroy_flow_action(struct ib_flow_action *action)
-{
-	struct mlx5_ib_flow_action *maction = to_mflow_act(action);
-
-	switch (action->type) {
-	case IB_FLOW_ACTION_ESP:
-		/*
-		 * We only support aes_gcm by now, so we implicitly know this is
-		 * the underline crypto.
-		 */
-		mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx);
-		break;
-	case IB_FLOW_ACTION_UNSPECIFIED:
-		mlx5_ib_destroy_flow_action_raw(maction);
-		break;
-	default:
-		WARN_ON(true);
-		break;
-	}
-
-	kfree(maction);
-	return 0;
+	return mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
 }
 
 static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
@@ -4349,7 +2580,7 @@
 	uid = ibqp->pd ?
 		to_mpd(ibqp->pd)->uid : 0;
 
-	if (mqp->flags & MLX5_IB_QP_UNDERLAY) {
+	if (mqp->flags & IB_QP_CREATE_SOURCE_QPN) {
 		mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n");
 		return -EOPNOTSUPP;
 	}
@@ -4461,9 +2692,7 @@
 		container_of(work, struct mlx5_ib_port_resources,
 			     pkey_change_work);
 
-	mutex_lock(&ports->devr->mutex);
 	mlx5_ib_gsi_pkey_change(ports->gsi);
-	mutex_unlock(&ports->devr->mutex);
 }
 
 static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
@@ -4535,8 +2764,7 @@
 	atomic_inc(&delay_drop->events_cnt);
 
 	mutex_lock(&delay_drop->lock);
-	err = mlx5_core_set_delay_drop(delay_drop->dev->mdev,
-				       delay_drop->timeout);
+	err = mlx5_core_set_delay_drop(delay_drop->dev, delay_drop->timeout);
 	if (err) {
 		mlx5_ib_warn(delay_drop->dev, "Failed to set delay drop, timeout=%u\n",
 			     delay_drop->timeout);
@@ -4635,7 +2863,7 @@
 		break;
 	case MLX5_EVENT_TYPE_GENERAL_EVENT:
 		handle_general_event(ibdev, work->param, &ibev);
-		/* fall through */
+		fallthrough;
 	default:
 		goto out;
 	}
@@ -4783,137 +3011,6 @@
 	return __get_port_caps(dev, port);
 }
 
-static void destroy_umrc_res(struct mlx5_ib_dev *dev)
-{
-	int err;
-
-	err = mlx5_mr_cache_cleanup(dev);
-	if (err)
-		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
-
-	if (dev->umrc.qp)
-		mlx5_ib_destroy_qp(dev->umrc.qp, NULL);
-	if (dev->umrc.cq)
-		ib_free_cq(dev->umrc.cq);
-	if (dev->umrc.pd)
-		ib_dealloc_pd(dev->umrc.pd);
-}
-
-enum {
-	MAX_UMR_WR = 128,
-};
-
-static int create_umr_res(struct mlx5_ib_dev *dev)
-{
-	struct ib_qp_init_attr *init_attr = NULL;
-	struct ib_qp_attr *attr = NULL;
-	struct ib_pd *pd;
-	struct ib_cq *cq;
-	struct ib_qp *qp;
-	int ret;
-
-	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
-	init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
-	if (!attr || !init_attr) {
-		ret = -ENOMEM;
-		goto error_0;
-	}
-
-	pd = ib_alloc_pd(&dev->ib_dev, 0);
-	if (IS_ERR(pd)) {
-		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
-		ret = PTR_ERR(pd);
-		goto error_0;
-	}
-
-	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
-	if (IS_ERR(cq)) {
-		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
-		ret = PTR_ERR(cq);
-		goto error_2;
-	}
-
-	init_attr->send_cq = cq;
-	init_attr->recv_cq = cq;
-	init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
-	init_attr->cap.max_send_wr = MAX_UMR_WR;
-	init_attr->cap.max_send_sge = 1;
-	init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
-	init_attr->port_num = 1;
-	qp = mlx5_ib_create_qp(pd, init_attr, NULL);
-	if (IS_ERR(qp)) {
-		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
-		ret = PTR_ERR(qp);
-		goto error_3;
-	}
-	qp->device     = &dev->ib_dev;
-	qp->real_qp    = qp;
-	qp->uobject    = NULL;
-	qp->qp_type    = MLX5_IB_QPT_REG_UMR;
-	qp->send_cq    = init_attr->send_cq;
-	qp->recv_cq    = init_attr->recv_cq;
-
-	attr->qp_state = IB_QPS_INIT;
-	attr->port_num = 1;
-	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
-				IB_QP_PORT, NULL);
-	if (ret) {
-		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
-		goto error_4;
-	}
-
-	memset(attr, 0, sizeof(*attr));
-	attr->qp_state = IB_QPS_RTR;
-	attr->path_mtu = IB_MTU_256;
-
-	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
-	if (ret) {
-		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
-		goto error_4;
-	}
-
-	memset(attr, 0, sizeof(*attr));
-	attr->qp_state = IB_QPS_RTS;
-	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
-	if (ret) {
-		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
-		goto error_4;
-	}
-
-	dev->umrc.qp = qp;
-	dev->umrc.cq = cq;
-	dev->umrc.pd = pd;
-
-	sema_init(&dev->umrc.sem, MAX_UMR_WR);
-	ret = mlx5_mr_cache_init(dev);
-	if (ret) {
-		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
-		goto error_4;
-	}
-
-	kfree(attr);
-	kfree(init_attr);
-
-	return 0;
-
-error_4:
-	mlx5_ib_destroy_qp(qp, NULL);
-	dev->umrc.qp = NULL;
-
-error_3:
-	ib_free_cq(cq);
-	dev->umrc.cq = NULL;
-
-error_2:
-	ib_dealloc_pd(pd);
-	dev->umrc.pd = NULL;
-
-error_0:
-	kfree(attr);
-	kfree(init_attr);
-	return ret;
-}
-
 static u8 mlx5_get_umr_fence(u8 umr_fence_cap)
 {
 	switch (umr_fence_cap) {
@@ -4926,18 +3023,20 @@
 	}
 }
 
-static int create_dev_resources(struct mlx5_ib_resources *devr)
+static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
 {
+	struct mlx5_ib_resources *devr = &dev->devr;
 	struct ib_srq_init_attr attr;
-	struct mlx5_ib_dev *dev;
 	struct ib_device *ibdev;
 	struct ib_cq_init_attr cq_attr = {.cqe = 1};
 	int port;
 	int ret = 0;
 
-	dev = container_of(devr, struct mlx5_ib_dev, devr);
 	ibdev = &dev->ib_dev;
 
+	if (!MLX5_CAP_GEN(dev->mdev, xrc))
+		return -EOPNOTSUPP;
+
 	mutex_init(&devr->mutex);
 
 	devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd);
@@ -4965,34 +3064,19 @@
 	if (ret)
 		goto err_create_cq;
 
-	devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
-	if (IS_ERR(devr->x0)) {
-		ret = PTR_ERR(devr->x0);
+	ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn0, 0);
+	if (ret)
 		goto error2;
-	}
-	devr->x0->device = &dev->ib_dev;
-	devr->x0->inode = NULL;
-	atomic_set(&devr->x0->usecnt, 0);
-	mutex_init(&devr->x0->tgt_qp_mutex);
-	INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
 
-	devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
-	if (IS_ERR(devr->x1)) {
-		ret = PTR_ERR(devr->x1);
+	ret = mlx5_cmd_xrcd_alloc(dev->mdev, &devr->xrcdn1, 0);
+	if (ret)
 		goto error3;
-	}
-	devr->x1->device = &dev->ib_dev;
-	devr->x1->inode = NULL;
-	atomic_set(&devr->x1->usecnt, 0);
-	mutex_init(&devr->x1->tgt_qp_mutex);
-	INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
 
 	memset(&attr, 0, sizeof(attr));
 	attr.attr.max_sge = 1;
 	attr.attr.max_wr = 1;
 	attr.srq_type = IB_SRQT_XRC;
 	attr.ext.cq = devr->c0;
-	attr.ext.xrc.xrcd = devr->x0;
 
 	devr->s0 = rdma_zalloc_drv_obj(ibdev, ib_srq);
 	if (!devr->s0) {
@@ -5003,13 +3087,11 @@
 	devr->s0->device	= &dev->ib_dev;
 	devr->s0->pd		= devr->p0;
 	devr->s0->srq_type      = IB_SRQT_XRC;
-	devr->s0->ext.xrc.xrcd	= devr->x0;
 	devr->s0->ext.cq	= devr->c0;
 	ret = mlx5_ib_create_srq(devr->s0, &attr, NULL);
 	if (ret)
 		goto err_create;
 
-	atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
 	atomic_inc(&devr->s0->ext.cq->usecnt);
 	atomic_inc(&devr->p0->usecnt);
 	atomic_set(&devr->s0->usecnt, 0);
@@ -5036,11 +3118,9 @@
 	atomic_inc(&devr->p0->usecnt);
 	atomic_set(&devr->s1->usecnt, 0);
 
-	for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) {
+	for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
 		INIT_WORK(&devr->ports[port].pkey_change_work,
 			  pkey_change_handler);
-		devr->ports[port].devr = devr;
-	}
 
 	return 0;
 
@@ -5051,9 +3131,9 @@
 err_create:
 	kfree(devr->s0);
 error4:
-	mlx5_ib_dealloc_xrcd(devr->x1, NULL);
+	mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0);
 error3:
-	mlx5_ib_dealloc_xrcd(devr->x0, NULL);
+	mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0);
 error2:
 	mlx5_ib_destroy_cq(devr->c0, NULL);
 err_create_cq:
@@ -5065,16 +3145,17 @@
 	return ret;
 }
 
-static void destroy_dev_resources(struct mlx5_ib_resources *devr)
+static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev)
 {
+	struct mlx5_ib_resources *devr = &dev->devr;
 	int port;
 
 	mlx5_ib_destroy_srq(devr->s1, NULL);
 	kfree(devr->s1);
 	mlx5_ib_destroy_srq(devr->s0, NULL);
 	kfree(devr->s0);
-	mlx5_ib_dealloc_xrcd(devr->x0, NULL);
-	mlx5_ib_dealloc_xrcd(devr->x1, NULL);
+	mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn1, 0);
+	mlx5_cmd_xrcd_dealloc(dev->mdev, devr->xrcdn0, 0);
 	mlx5_ib_destroy_cq(devr->c0, NULL);
 	kfree(devr->c0);
 	mlx5_ib_dealloc_pd(devr->p0, NULL);
@@ -5142,8 +3223,7 @@
 	immutable->pkey_tbl_len = attr.pkey_tbl_len;
 	immutable->gid_tbl_len = attr.gid_tbl_len;
 	immutable->core_cap_flags = get_core_cap_flags(ibdev, &rep);
-	if ((ll == IB_LINK_LAYER_INFINIBAND) || MLX5_CAP_GEN(dev->mdev, roce))
-		immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+	immutable->max_mad_size = IB_MGMT_MAD_SIZE;
 
 	return 0;
 }
@@ -5246,11 +3326,9 @@
 {
 	int err;
 
-	if (MLX5_CAP_GEN(dev->mdev, roce)) {
-		err = mlx5_nic_vport_enable_roce(dev->mdev);
-		if (err)
-			return err;
-	}
+	err = mlx5_nic_vport_enable_roce(dev->mdev);
+	if (err)
+		return err;
 
 	err = mlx5_eth_lag_init(dev);
 	if (err)
@@ -5259,8 +3337,7 @@
 	return 0;
 
 err_disable_roce:
-	if (MLX5_CAP_GEN(dev->mdev, roce))
-		mlx5_nic_vport_disable_roce(dev->mdev);
+	mlx5_nic_vport_disable_roce(dev->mdev);
 
 	return err;
 }
@@ -5268,432 +3345,7 @@
 static void mlx5_disable_eth(struct mlx5_ib_dev *dev)
 {
 	mlx5_eth_lag_cleanup(dev);
-	if (MLX5_CAP_GEN(dev->mdev, roce))
-		mlx5_nic_vport_disable_roce(dev->mdev);
-}
-
-struct mlx5_ib_counter {
-	const char *name;
-	size_t offset;
-};
-
-#define INIT_Q_COUNTER(_name)		\
-	{ .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)}
-
-static const struct mlx5_ib_counter basic_q_cnts[] = {
-	INIT_Q_COUNTER(rx_write_requests),
-	INIT_Q_COUNTER(rx_read_requests),
-	INIT_Q_COUNTER(rx_atomic_requests),
-	INIT_Q_COUNTER(out_of_buffer),
-};
-
-static const struct mlx5_ib_counter out_of_seq_q_cnts[] = {
-	INIT_Q_COUNTER(out_of_sequence),
-};
-
-static const struct mlx5_ib_counter retrans_q_cnts[] = {
-	INIT_Q_COUNTER(duplicate_request),
-	INIT_Q_COUNTER(rnr_nak_retry_err),
-	INIT_Q_COUNTER(packet_seq_err),
-	INIT_Q_COUNTER(implied_nak_seq_err),
-	INIT_Q_COUNTER(local_ack_timeout_err),
-};
-
-#define INIT_CONG_COUNTER(_name)		\
-	{ .name = #_name, .offset =	\
-		MLX5_BYTE_OFF(query_cong_statistics_out, _name ## _high)}
-
-static const struct mlx5_ib_counter cong_cnts[] = {
-	INIT_CONG_COUNTER(rp_cnp_ignored),
-	INIT_CONG_COUNTER(rp_cnp_handled),
-	INIT_CONG_COUNTER(np_ecn_marked_roce_packets),
-	INIT_CONG_COUNTER(np_cnp_sent),
-};
-
-static const struct mlx5_ib_counter extended_err_cnts[] = {
-	INIT_Q_COUNTER(resp_local_length_error),
-	INIT_Q_COUNTER(resp_cqe_error),
-	INIT_Q_COUNTER(req_cqe_error),
-	INIT_Q_COUNTER(req_remote_invalid_request),
-	INIT_Q_COUNTER(req_remote_access_errors),
-	INIT_Q_COUNTER(resp_remote_access_errors),
-	INIT_Q_COUNTER(resp_cqe_flush_error),
-	INIT_Q_COUNTER(req_cqe_flush_error),
-};
-
-#define INIT_EXT_PPCNT_COUNTER(_name)		\
-	{ .name = #_name, .offset =	\
-	MLX5_BYTE_OFF(ppcnt_reg, \
-		      counter_set.eth_extended_cntrs_grp_data_layout._name##_high)}
-
-static const struct mlx5_ib_counter ext_ppcnt_cnts[] = {
-	INIT_EXT_PPCNT_COUNTER(rx_icrc_encapsulated),
-};
-
-static bool is_mdev_switchdev_mode(const struct mlx5_core_dev *mdev)
-{
-	return MLX5_ESWITCH_MANAGER(mdev) &&
-	       mlx5_ib_eswitch_mode(mdev->priv.eswitch) ==
-		       MLX5_ESWITCH_OFFLOADS;
-}
-
-static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev)
-{
-	int num_cnt_ports;
-	int i;
-
-	num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
-
-	for (i = 0; i < num_cnt_ports; i++) {
-		if (dev->port[i].cnts.set_id_valid)
-			mlx5_core_dealloc_q_counter(dev->mdev,
-						    dev->port[i].cnts.set_id);
-		kfree(dev->port[i].cnts.names);
-		kfree(dev->port[i].cnts.offsets);
-	}
-}
-
-static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev,
-				    struct mlx5_ib_counters *cnts)
-{
-	u32 num_counters;
-
-	num_counters = ARRAY_SIZE(basic_q_cnts);
-
-	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt))
-		num_counters += ARRAY_SIZE(out_of_seq_q_cnts);
-
-	if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters))
-		num_counters += ARRAY_SIZE(retrans_q_cnts);
-
-	if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters))
-		num_counters += ARRAY_SIZE(extended_err_cnts);
-
-	cnts->num_q_counters = num_counters;
-
-	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
-		cnts->num_cong_counters = ARRAY_SIZE(cong_cnts);
-		num_counters += ARRAY_SIZE(cong_cnts);
-	}
-	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
-		cnts->num_ext_ppcnt_counters = ARRAY_SIZE(ext_ppcnt_cnts);
-		num_counters += ARRAY_SIZE(ext_ppcnt_cnts);
-	}
-	cnts->names = kcalloc(num_counters, sizeof(cnts->names), GFP_KERNEL);
-	if (!cnts->names)
-		return -ENOMEM;
-
-	cnts->offsets = kcalloc(num_counters,
-				sizeof(cnts->offsets), GFP_KERNEL);
-	if (!cnts->offsets)
-		goto err_names;
-
-	return 0;
-
-err_names:
-	kfree(cnts->names);
-	cnts->names = NULL;
-	return -ENOMEM;
-}
-
-static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev,
-				  const char **names,
-				  size_t *offsets)
-{
-	int i;
-	int j = 0;
-
-	for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) {
-		names[j] = basic_q_cnts[i].name;
-		offsets[j] = basic_q_cnts[i].offset;
-	}
-
-	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) {
-		for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) {
-			names[j] = out_of_seq_q_cnts[i].name;
-			offsets[j] = out_of_seq_q_cnts[i].offset;
-		}
-	}
-
-	if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
-		for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) {
-			names[j] = retrans_q_cnts[i].name;
-			offsets[j] = retrans_q_cnts[i].offset;
-		}
-	}
-
-	if (MLX5_CAP_GEN(dev->mdev, enhanced_error_q_counters)) {
-		for (i = 0; i < ARRAY_SIZE(extended_err_cnts); i++, j++) {
-			names[j] = extended_err_cnts[i].name;
-			offsets[j] = extended_err_cnts[i].offset;
-		}
-	}
-
-	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
-		for (i = 0; i < ARRAY_SIZE(cong_cnts); i++, j++) {
-			names[j] = cong_cnts[i].name;
-			offsets[j] = cong_cnts[i].offset;
-		}
-	}
-
-	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
-		for (i = 0; i < ARRAY_SIZE(ext_ppcnt_cnts); i++, j++) {
-			names[j] = ext_ppcnt_cnts[i].name;
-			offsets[j] = ext_ppcnt_cnts[i].offset;
-		}
-	}
-}
-
-static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev)
-{
-	int num_cnt_ports;
-	int err = 0;
-	int i;
-	bool is_shared;
-
-	is_shared = MLX5_CAP_GEN(dev->mdev, log_max_uctx) != 0;
-	num_cnt_ports = is_mdev_switchdev_mode(dev->mdev) ? 1 : dev->num_ports;
-
-	for (i = 0; i < num_cnt_ports; i++) {
-		err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts);
-		if (err)
-			goto err_alloc;
-
-		mlx5_ib_fill_counters(dev, dev->port[i].cnts.names,
-				      dev->port[i].cnts.offsets);
-
-		err = mlx5_cmd_alloc_q_counter(dev->mdev,
-					       &dev->port[i].cnts.set_id,
-					       is_shared ?
-					       MLX5_SHARED_RESOURCE_UID : 0);
-		if (err) {
-			mlx5_ib_warn(dev,
-				     "couldn't allocate queue counter for port %d, err %d\n",
-				     i + 1, err);
-			goto err_alloc;
-		}
-		dev->port[i].cnts.set_id_valid = true;
-	}
-	return 0;
-
-err_alloc:
-	mlx5_ib_dealloc_counters(dev);
-	return err;
-}
-
-static const struct mlx5_ib_counters *get_counters(struct mlx5_ib_dev *dev,
-						   u8 port_num)
-{
-	return is_mdev_switchdev_mode(dev->mdev) ? &dev->port[0].cnts :
-						   &dev->port[port_num].cnts;
-}
-
-/**
- * mlx5_ib_get_counters_id - Returns counters id to use for device+port
- * @dev:	Pointer to mlx5 IB device
- * @port_num:	Zero based port number
- *
- * mlx5_ib_get_counters_id() Returns counters set id to use for given
- * device port combination in switchdev and non switchdev mode of the
- * parent device.
- */
-u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num)
-{
-	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num);
-
-	return cnts->set_id;
-}
-
-static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
-						    u8 port_num)
-{
-	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	const struct mlx5_ib_counters *cnts;
-	bool is_switchdev = is_mdev_switchdev_mode(dev->mdev);
-
-	if ((is_switchdev && port_num) || (!is_switchdev && !port_num))
-		return NULL;
-
-	cnts = get_counters(dev, port_num - 1);
-
-	return rdma_alloc_hw_stats_struct(cnts->names,
-					  cnts->num_q_counters +
-					  cnts->num_cong_counters +
-					  cnts->num_ext_ppcnt_counters,
-					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
-}
-
-static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev,
-				    const struct mlx5_ib_counters *cnts,
-				    struct rdma_hw_stats *stats,
-				    u16 set_id)
-{
-	int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
-	void *out;
-	__be32 val;
-	int ret, i;
-
-	out = kvzalloc(outlen, GFP_KERNEL);
-	if (!out)
-		return -ENOMEM;
-
-	ret = mlx5_core_query_q_counter(mdev, set_id, 0, out, outlen);
-	if (ret)
-		goto free;
-
-	for (i = 0; i < cnts->num_q_counters; i++) {
-		val = *(__be32 *)(out + cnts->offsets[i]);
-		stats->value[i] = (u64)be32_to_cpu(val);
-	}
-
-free:
-	kvfree(out);
-	return ret;
-}
-
-static int mlx5_ib_query_ext_ppcnt_counters(struct mlx5_ib_dev *dev,
-					    const struct mlx5_ib_counters *cnts,
-					    struct rdma_hw_stats *stats)
-{
-	int offset = cnts->num_q_counters + cnts->num_cong_counters;
-	int sz = MLX5_ST_SZ_BYTES(ppcnt_reg);
-	int ret, i;
-	void *out;
-
-	out = kvzalloc(sz, GFP_KERNEL);
-	if (!out)
-		return -ENOMEM;
-
-	ret = mlx5_cmd_query_ext_ppcnt_counters(dev->mdev, out);
-	if (ret)
-		goto free;
-
-	for (i = 0; i < cnts->num_ext_ppcnt_counters; i++)
-		stats->value[i + offset] =
-			be64_to_cpup((__be64 *)(out +
-				    cnts->offsets[i + offset]));
-free:
-	kvfree(out);
-	return ret;
-}
-
-static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
-				struct rdma_hw_stats *stats,
-				u8 port_num, int index)
-{
-	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	const struct mlx5_ib_counters *cnts = get_counters(dev, port_num - 1);
-	struct mlx5_core_dev *mdev;
-	int ret, num_counters;
-	u8 mdev_port_num;
-
-	if (!stats)
-		return -EINVAL;
-
-	num_counters = cnts->num_q_counters +
-		       cnts->num_cong_counters +
-		       cnts->num_ext_ppcnt_counters;
-
-	/* q_counters are per IB device, query the master mdev */
-	ret = mlx5_ib_query_q_counters(dev->mdev, cnts, stats, cnts->set_id);
-	if (ret)
-		return ret;
-
-	if (MLX5_CAP_PCAM_FEATURE(dev->mdev, rx_icrc_encapsulated_counter)) {
-		ret =  mlx5_ib_query_ext_ppcnt_counters(dev, cnts, stats);
-		if (ret)
-			return ret;
-	}
-
-	if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) {
-		mdev = mlx5_ib_get_native_port_mdev(dev, port_num,
-						    &mdev_port_num);
-		if (!mdev) {
-			/* If port is not affiliated yet, its in down state
-			 * which doesn't have any counters yet, so it would be
-			 * zero. So no need to read from the HCA.
-			 */
-			goto done;
-		}
-		ret = mlx5_lag_query_cong_counters(dev->mdev,
-						   stats->value +
-						   cnts->num_q_counters,
-						   cnts->num_cong_counters,
-						   cnts->offsets +
-						   cnts->num_q_counters);
-
-		mlx5_ib_put_native_port_mdev(dev, port_num);
-		if (ret)
-			return ret;
-	}
-
-done:
-	return num_counters;
-}
-
-static struct rdma_hw_stats *
-mlx5_ib_counter_alloc_stats(struct rdma_counter *counter)
-{
-	struct mlx5_ib_dev *dev = to_mdev(counter->device);
-	const struct mlx5_ib_counters *cnts =
-		get_counters(dev, counter->port - 1);
-
-	return rdma_alloc_hw_stats_struct(cnts->names,
-					  cnts->num_q_counters +
-					  cnts->num_cong_counters +
-					  cnts->num_ext_ppcnt_counters,
-					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
-}
-
-static int mlx5_ib_counter_update_stats(struct rdma_counter *counter)
-{
-	struct mlx5_ib_dev *dev = to_mdev(counter->device);
-	const struct mlx5_ib_counters *cnts =
-		get_counters(dev, counter->port - 1);
-
-	return mlx5_ib_query_q_counters(dev->mdev, cnts,
-					counter->stats, counter->id);
-}
-
-static int mlx5_ib_counter_bind_qp(struct rdma_counter *counter,
-				   struct ib_qp *qp)
-{
-	struct mlx5_ib_dev *dev = to_mdev(qp->device);
-	u16 cnt_set_id = 0;
-	int err;
-
-	if (!counter->id) {
-		err = mlx5_cmd_alloc_q_counter(dev->mdev,
-					       &cnt_set_id,
-					       MLX5_SHARED_RESOURCE_UID);
-		if (err)
-			return err;
-		counter->id = cnt_set_id;
-	}
-
-	err = mlx5_ib_qp_set_counter(qp, counter);
-	if (err)
-		goto fail_set_counter;
-
-	return 0;
-
-fail_set_counter:
-	mlx5_core_dealloc_q_counter(dev->mdev, cnt_set_id);
-	counter->id = 0;
-
-	return err;
-}
-
-static int mlx5_ib_counter_unbind_qp(struct ib_qp *qp)
-{
-	return mlx5_ib_qp_set_counter(qp, NULL);
-}
-
-static int mlx5_ib_counter_dealloc(struct rdma_counter *counter)
-{
-	struct mlx5_ib_dev *dev = to_mdev(counter->device);
-
-	return mlx5_core_dealloc_q_counter(dev->mdev, counter->id);
+	mlx5_nic_vport_disable_roce(dev->mdev);
 }
 
 static int mlx5_ib_rn_get_params(struct ib_device *device, u8 port_num,
@@ -5706,24 +3358,6 @@
 	return mlx5_rdma_rn_get_params(to_mdev(device)->mdev, device, params);
 }
 
-static void delay_drop_debugfs_cleanup(struct mlx5_ib_dev *dev)
-{
-	if (!dev->delay_drop.dbg)
-		return;
-	debugfs_remove_recursive(dev->delay_drop.dbg->dir_debugfs);
-	kfree(dev->delay_drop.dbg);
-	dev->delay_drop.dbg = NULL;
-}
-
-static void cancel_delay_drop(struct mlx5_ib_dev *dev)
-{
-	if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
-		return;
-
-	cancel_work_sync(&dev->delay_drop.delay_drop_work);
-	delay_drop_debugfs_cleanup(dev);
-}
-
 static ssize_t delay_drop_timeout_read(struct file *filp, char __user *buf,
 				       size_t count, loff_t *pos)
 {
@@ -5763,71 +3397,6 @@
 	.read	= delay_drop_timeout_read,
 };
 
-static int delay_drop_debugfs_init(struct mlx5_ib_dev *dev)
-{
-	struct mlx5_ib_dbg_delay_drop *dbg;
-
-	if (!mlx5_debugfs_root)
-		return 0;
-
-	dbg = kzalloc(sizeof(*dbg), GFP_KERNEL);
-	if (!dbg)
-		return -ENOMEM;
-
-	dev->delay_drop.dbg = dbg;
-
-	dbg->dir_debugfs =
-		debugfs_create_dir("delay_drop",
-				   dev->mdev->priv.dbg_root);
-	if (!dbg->dir_debugfs)
-		goto out_debugfs;
-
-	dbg->events_cnt_debugfs =
-		debugfs_create_atomic_t("num_timeout_events", 0400,
-					dbg->dir_debugfs,
-					&dev->delay_drop.events_cnt);
-	if (!dbg->events_cnt_debugfs)
-		goto out_debugfs;
-
-	dbg->rqs_cnt_debugfs =
-		debugfs_create_atomic_t("num_rqs", 0400,
-					dbg->dir_debugfs,
-					&dev->delay_drop.rqs_cnt);
-	if (!dbg->rqs_cnt_debugfs)
-		goto out_debugfs;
-
-	dbg->timeout_debugfs =
-		debugfs_create_file("timeout", 0600,
-				    dbg->dir_debugfs,
-				    &dev->delay_drop,
-				    &fops_delay_drop_timeout);
-	if (!dbg->timeout_debugfs)
-		goto out_debugfs;
-
-	return 0;
-
-out_debugfs:
-	delay_drop_debugfs_cleanup(dev);
-	return -ENOMEM;
-}
-
-static void init_delay_drop(struct mlx5_ib_dev *dev)
-{
-	if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
-		return;
-
-	mutex_init(&dev->delay_drop.lock);
-	dev->delay_drop.dev = dev;
-	dev->delay_drop.activate = false;
-	dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000;
-	INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler);
-	atomic_set(&dev->delay_drop.rqs_cnt, 0);
-	atomic_set(&dev->delay_drop.events_cnt, 0);
-
-	if (delay_drop_debugfs_init(dev))
-		mlx5_ib_warn(dev, "Failed to init delay drop debugfs\n");
-}
-
 static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
 				      struct mlx5_ib_multiport_info *mpi)
 {
@@ -6037,6 +3606,269 @@
 	mlx5_nic_vport_disable_roce(dev->mdev);
 }
 
+static int mmap_obj_cleanup(struct ib_uobject *uobject,
+			    enum rdma_remove_reason why,
+			    struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_user_mmap_entry *obj = uobject->object;
+
+	rdma_user_mmap_entry_remove(&obj->rdma_entry);
+	return 0;
+}
+
+static int mlx5_rdma_user_mmap_entry_insert(struct mlx5_ib_ucontext *c,
+					    struct mlx5_user_mmap_entry *entry,
+					    size_t length)
+{
+	return rdma_user_mmap_entry_insert_range(
+		&c->ibucontext, &entry->rdma_entry, length,
+		(MLX5_IB_MMAP_OFFSET_START << 16),
+		((MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1));
+}
+
+static struct mlx5_user_mmap_entry *
+alloc_var_entry(struct mlx5_ib_ucontext *c)
+{
+	struct mlx5_user_mmap_entry *entry;
+	struct mlx5_var_table *var_table;
+	u32 page_idx;
+	int err;
+
+	var_table = &to_mdev(c->ibucontext.device)->var_table;
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_lock(&var_table->bitmap_lock);
+	page_idx = find_first_zero_bit(var_table->bitmap,
+				       var_table->num_var_hw_entries);
+	if (page_idx >= var_table->num_var_hw_entries) {
+		err = -ENOSPC;
+		mutex_unlock(&var_table->bitmap_lock);
+		goto end;
+	}
+
+	set_bit(page_idx, var_table->bitmap);
+	mutex_unlock(&var_table->bitmap_lock);
+
+	entry->address = var_table->hw_start_addr +
+				(page_idx * var_table->stride_size);
+	entry->page_idx = page_idx;
+	entry->mmap_flag = MLX5_IB_MMAP_TYPE_VAR;
+
+	err = mlx5_rdma_user_mmap_entry_insert(c, entry,
+					       var_table->stride_size);
+	if (err)
+		goto err_insert;
+
+	return entry;
+
+err_insert:
+	mutex_lock(&var_table->bitmap_lock);
+	clear_bit(page_idx, var_table->bitmap);
+	mutex_unlock(&var_table->bitmap_lock);
+end:
+	kfree(entry);
+	return ERR_PTR(err);
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_VAR_OBJ_ALLOC)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE);
+	struct mlx5_ib_ucontext *c;
+	struct mlx5_user_mmap_entry *entry;
+	u64 mmap_offset;
+	u32 length;
+	int err;
+
+	c = to_mucontext(ib_uverbs_get_ucontext(attrs));
+	if (IS_ERR(c))
+		return PTR_ERR(c);
+
+	entry = alloc_var_entry(c);
+	if (IS_ERR(entry))
+		return PTR_ERR(entry);
+
+	mmap_offset = mlx5_entry_to_mmap_offset(entry);
+	length = entry->rdma_entry.npages * PAGE_SIZE;
+	uobj->object = entry;
+	uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE);
+
+	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET,
+			     &mmap_offset, sizeof(mmap_offset));
+	if (err)
+		return err;
+
+	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID,
+			     &entry->page_idx, sizeof(entry->page_idx));
+	if (err)
+		return err;
+
+	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH,
+			     &length, sizeof(length));
+	return err;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_VAR_OBJ_ALLOC,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_ALLOC_HANDLE,
+			MLX5_IB_OBJECT_VAR,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_PAGE_ID,
+			   UVERBS_ATTR_TYPE(u32),
+			   UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_LENGTH,
+			   UVERBS_ATTR_TYPE(u32),
+			   UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_VAR_OBJ_ALLOC_MMAP_OFFSET,
+			    UVERBS_ATTR_TYPE(u64),
+			    UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	MLX5_IB_METHOD_VAR_OBJ_DESTROY,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_VAR_OBJ_DESTROY_HANDLE,
+			MLX5_IB_OBJECT_VAR,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_VAR,
+			    UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_ALLOC),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_DESTROY));
+
+static bool var_is_supported(struct ib_device *device)
+{
+	struct mlx5_ib_dev *dev = to_mdev(device);
+
+	return (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
+			MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q);
+}
+
+static struct mlx5_user_mmap_entry *
+alloc_uar_entry(struct mlx5_ib_ucontext *c,
+		enum mlx5_ib_uapi_uar_alloc_type alloc_type)
+{
+	struct mlx5_user_mmap_entry *entry;
+	struct mlx5_ib_dev *dev;
+	u32 uar_index;
+	int err;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return ERR_PTR(-ENOMEM);
+
+	dev = to_mdev(c->ibucontext.device);
+	err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index);
+	if (err)
+		goto end;
+
+	entry->page_idx = uar_index;
+	entry->address = uar_index2paddress(dev, uar_index);
+	if (alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
+		entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_WC;
+	else
+		entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_NC;
+
+	err = mlx5_rdma_user_mmap_entry_insert(c, entry, PAGE_SIZE);
+	if (err)
+		goto err_insert;
+
+	return entry;
+
+err_insert:
+	mlx5_cmd_free_uar(dev->mdev, uar_index);
+end:
+	kfree(entry);
+	return ERR_PTR(err);
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(
+		attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE);
+	enum mlx5_ib_uapi_uar_alloc_type alloc_type;
+	struct mlx5_ib_ucontext *c;
+	struct mlx5_user_mmap_entry *entry;
+	u64 mmap_offset;
+	u32 length;
+	int err;
+
+	c = to_mucontext(ib_uverbs_get_ucontext(attrs));
+	if (IS_ERR(c))
+		return PTR_ERR(c);
+
+	err = uverbs_get_const(&alloc_type, attrs,
+			       MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE);
+	if (err)
+		return err;
+
+	if (alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF &&
+	    alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC)
+		return -EOPNOTSUPP;
+
+	if (!to_mdev(c->ibucontext.device)->wc_support &&
+	    alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF)
+		return -EOPNOTSUPP;
+
+	entry = alloc_uar_entry(c, alloc_type);
+	if (IS_ERR(entry))
+		return PTR_ERR(entry);
+
+	mmap_offset = mlx5_entry_to_mmap_offset(entry);
+	length = entry->rdma_entry.npages * PAGE_SIZE;
+	uobj->object = entry;
+	uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE);
+
+	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
+			     &mmap_offset, sizeof(mmap_offset));
+	if (err)
+		return err;
+
+	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
+			     &entry->page_idx, sizeof(entry->page_idx));
+	if (err)
+		return err;
+
+	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
+			     &length, sizeof(length));
+	return err;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_UAR_OBJ_ALLOC,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE,
+			MLX5_IB_OBJECT_UAR,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE,
+			     enum mlx5_ib_uapi_uar_alloc_type,
+			     UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID,
+			   UVERBS_ATTR_TYPE(u32),
+			   UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH,
+			   UVERBS_ATTR_TYPE(u32),
+			   UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET,
+			    UVERBS_ATTR_TYPE(u64),
+			    UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	MLX5_IB_METHOD_UAR_OBJ_DESTROY,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_DESTROY_HANDLE,
+			MLX5_IB_OBJECT_UAR,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_UAR,
+			    UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_ALLOC),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_DESTROY));
+
 ADD_UVERBS_ATTRIBUTES_SIMPLE(
 	mlx5_ib_dm,
 	UVERBS_OBJECT_DM,
@@ -6058,96 +3890,39 @@
 	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_CREATE_FLOW_ACTION_FLAGS,
 			     enum mlx5_ib_uapi_flow_action_flags));
 
+ADD_UVERBS_ATTRIBUTES_SIMPLE(
+	mlx5_ib_query_context,
+	UVERBS_OBJECT_DEVICE,
+	UVERBS_METHOD_QUERY_CONTEXT,
+	UVERBS_ATTR_PTR_OUT(
+		MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX,
+		UVERBS_ATTR_STRUCT(struct mlx5_ib_alloc_ucontext_resp,
+				   dump_fill_mkey),
+		UA_MANDATORY));
+
 static const struct uapi_definition mlx5_ib_defs[] = {
-#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
 	UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
 	UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
-#endif
+	UAPI_DEF_CHAIN(mlx5_ib_qos_defs),
+	UAPI_DEF_CHAIN(mlx5_ib_std_types_defs),
 
 	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION,
 				&mlx5_ib_flow_action),
 	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm),
+	UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR,
+				UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)),
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR),
 	{}
 };
 
-static int mlx5_ib_read_counters(struct ib_counters *counters,
-				 struct ib_counters_read_attr *read_attr,
-				 struct uverbs_attr_bundle *attrs)
-{
-	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
-	struct mlx5_read_counters_attr mread_attr = {};
-	struct mlx5_ib_flow_counters_desc *desc;
-	int ret, i;
-
-	mutex_lock(&mcounters->mcntrs_mutex);
-	if (mcounters->cntrs_max_index > read_attr->ncounters) {
-		ret = -EINVAL;
-		goto err_bound;
-	}
-
-	mread_attr.out = kcalloc(mcounters->counters_num, sizeof(u64),
-				 GFP_KERNEL);
-	if (!mread_attr.out) {
-		ret = -ENOMEM;
-		goto err_bound;
-	}
-
-	mread_attr.hw_cntrs_hndl = mcounters->hw_cntrs_hndl;
-	mread_attr.flags = read_attr->flags;
-	ret = mcounters->read_counters(counters->device, &mread_attr);
-	if (ret)
-		goto err_read;
-
-	/* do the pass over the counters data array to assign according to the
-	 * descriptions and indexing pairs
-	 */
-	desc = mcounters->counters_data;
-	for (i = 0; i < mcounters->ncounters; i++)
-		read_attr->counters_buff[desc[i].index] += mread_attr.out[desc[i].description];
-
-err_read:
-	kfree(mread_attr.out);
-err_bound:
-	mutex_unlock(&mcounters->mcntrs_mutex);
-	return ret;
-}
-
-static int mlx5_ib_destroy_counters(struct ib_counters *counters)
-{
-	struct mlx5_ib_mcounters *mcounters = to_mcounters(counters);
-
-	counters_clear_description(counters);
-	if (mcounters->hw_cntrs_hndl)
-		mlx5_fc_destroy(to_mdev(counters->device)->mdev,
-				mcounters->hw_cntrs_hndl);
-
-	kfree(mcounters);
-
-	return 0;
-}
-
-static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device,
-						   struct uverbs_attr_bundle *attrs)
-{
-	struct mlx5_ib_mcounters *mcounters;
-
-	mcounters = kzalloc(sizeof(*mcounters), GFP_KERNEL);
-	if (!mcounters)
-		return ERR_PTR(-ENOMEM);
-
-	mutex_init(&mcounters->mcntrs_mutex);
-
-	return &mcounters->ibcntrs;
-}
-
 static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 {
 	mlx5_ib_cleanup_multiport_master(dev);
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
-		srcu_barrier(&dev->mr_srcu);
-		cleanup_srcu_struct(&dev->mr_srcu);
-	}
-
+	WARN_ON(!xa_empty(&dev->odp_mkeys));
+	cleanup_srcu_struct(&dev->odp_srcu);
+	mutex_destroy(&dev->cap_mask_mutex);
+	WARN_ON(!xa_empty(&dev->sig_mrs));
 	WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
 }
 
@@ -6195,20 +3970,21 @@
 	dev->ib_dev.phys_port_cnt	= dev->num_ports;
 	dev->ib_dev.num_comp_vectors    = mlx5_comp_vectors_count(mdev);
 	dev->ib_dev.dev.parent		= mdev->device;
+	dev->ib_dev.lag_flags		= RDMA_LAG_FLAGS_HASH_ALL_SLAVES;
+
+	err = init_srcu_struct(&dev->odp_srcu);
+	if (err)
+		goto err_mp;
 
 	mutex_init(&dev->cap_mask_mutex);
 	INIT_LIST_HEAD(&dev->qp_list);
 	spin_lock_init(&dev->reset_flow_resource_lock);
+	xa_init(&dev->odp_mkeys);
+	xa_init(&dev->sig_mrs);
+	atomic_set(&dev->mkey_var, 0);
 
 	spin_lock_init(&dev->dm.lock);
 	dev->dm.dev = mdev;
-
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
-		err = init_srcu_struct(&dev->mr_srcu);
-		if (err)
-			goto err_mp;
-	}
-
 	return 0;
 
 err_mp:
@@ -6216,21 +3992,16 @@
 	return err;
 }
 
-static int mlx5_ib_stage_flow_db_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_enable_driver(struct ib_device *dev)
 {
-	dev->flow_db = kzalloc(sizeof(*dev->flow_db), GFP_KERNEL);
+	struct mlx5_ib_dev *mdev = to_mdev(dev);
+	int ret;
 
-	if (!dev->flow_db)
-		return -ENOMEM;
+	ret = mlx5_ib_test_wc(mdev);
+	mlx5_ib_dbg(mdev, "Write-Combining %s",
+		    mdev->wc_support ? "supported" : "not supported");
 
-	mutex_init(&dev->flow_db->lock);
-
-	return 0;
-}
-
-static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
-{
-	kfree(dev->flow_db);
+	return ret;
 }
 
 static const struct ib_device_ops mlx5_ib_dev_ops = {
@@ -6246,9 +4017,7 @@
 	.attach_mcast = mlx5_ib_mcg_attach,
 	.check_mr_status = mlx5_ib_check_mr_status,
 	.create_ah = mlx5_ib_create_ah,
-	.create_counters = mlx5_ib_create_counters,
 	.create_cq = mlx5_ib_create_cq,
-	.create_flow = mlx5_ib_create_flow,
 	.create_qp = mlx5_ib_create_qp,
 	.create_srq = mlx5_ib_create_srq,
 	.dealloc_pd = mlx5_ib_dealloc_pd,
@@ -6256,30 +4025,29 @@
 	.del_gid = mlx5_ib_del_gid,
 	.dereg_mr = mlx5_ib_dereg_mr,
 	.destroy_ah = mlx5_ib_destroy_ah,
-	.destroy_counters = mlx5_ib_destroy_counters,
 	.destroy_cq = mlx5_ib_destroy_cq,
-	.destroy_flow = mlx5_ib_destroy_flow,
-	.destroy_flow_action = mlx5_ib_destroy_flow_action,
 	.destroy_qp = mlx5_ib_destroy_qp,
 	.destroy_srq = mlx5_ib_destroy_srq,
 	.detach_mcast = mlx5_ib_mcg_detach,
 	.disassociate_ucontext = mlx5_ib_disassociate_ucontext,
 	.drain_rq = mlx5_ib_drain_rq,
 	.drain_sq = mlx5_ib_drain_sq,
+	.enable_driver = mlx5_ib_enable_driver,
 	.get_dev_fw_str = get_dev_fw_str,
 	.get_dma_mr = mlx5_ib_get_dma_mr,
 	.get_link_layer = mlx5_ib_port_link_layer,
 	.map_mr_sg = mlx5_ib_map_mr_sg,
 	.map_mr_sg_pi = mlx5_ib_map_mr_sg_pi,
 	.mmap = mlx5_ib_mmap,
+	.mmap_free = mlx5_ib_mmap_free,
 	.modify_cq = mlx5_ib_modify_cq,
 	.modify_device = mlx5_ib_modify_device,
 	.modify_port = mlx5_ib_modify_port,
 	.modify_qp = mlx5_ib_modify_qp,
 	.modify_srq = mlx5_ib_modify_srq,
 	.poll_cq = mlx5_ib_poll_cq,
-	.post_recv = mlx5_ib_post_recv,
-	.post_send = mlx5_ib_post_send,
+	.post_recv = mlx5_ib_post_recv_nodrain,
+	.post_send = mlx5_ib_post_send_nodrain,
 	.post_srq_recv = mlx5_ib_post_srq_recv,
 	.process_mad = mlx5_ib_process_mad,
 	.query_ah = mlx5_ib_query_ah,
@@ -6288,30 +4056,27 @@
 	.query_pkey = mlx5_ib_query_pkey,
 	.query_qp = mlx5_ib_query_qp,
 	.query_srq = mlx5_ib_query_srq,
-	.read_counters = mlx5_ib_read_counters,
+	.query_ucontext = mlx5_ib_query_ucontext,
 	.reg_user_mr = mlx5_ib_reg_user_mr,
 	.req_notify_cq = mlx5_ib_arm_cq,
 	.rereg_user_mr = mlx5_ib_rereg_user_mr,
 	.resize_cq = mlx5_ib_resize_cq,
 
 	INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
+	INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs),
 	INIT_RDMA_OBJ_SIZE(ib_cq, mlx5_ib_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
 	INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
 };
 
-static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = {
-	.create_flow_action_esp = mlx5_ib_create_flow_action_esp,
-	.modify_flow_action_esp = mlx5_ib_modify_flow_action_esp,
-};
-
 static const struct ib_device_ops mlx5_ib_dev_ipoib_enhanced_ops = {
 	.rdma_netdev_get_params = mlx5_ib_rn_get_params,
 };
 
 static const struct ib_device_ops mlx5_ib_dev_sriov_ops = {
 	.get_vf_config = mlx5_ib_get_vf_config,
+	.get_vf_guid = mlx5_ib_get_vf_guid,
 	.get_vf_stats = mlx5_ib_get_vf_stats,
 	.set_vf_guid = mlx5_ib_set_vf_guid,
 	.set_vf_link_state = mlx5_ib_set_vf_link_state,
@@ -6320,11 +4085,15 @@
 static const struct ib_device_ops mlx5_ib_dev_mw_ops = {
 	.alloc_mw = mlx5_ib_alloc_mw,
 	.dealloc_mw = mlx5_ib_dealloc_mw,
+
+	INIT_RDMA_OBJ_SIZE(ib_mw, mlx5_ib_mw, ibmw),
 };
 
 static const struct ib_device_ops mlx5_ib_dev_xrc_ops = {
 	.alloc_xrcd = mlx5_ib_alloc_xrcd,
 	.dealloc_xrcd = mlx5_ib_dealloc_xrcd,
+
+	INIT_RDMA_OBJ_SIZE(ib_xrcd, mlx5_ib_xrcd, ibxrcd),
 };
 
 static const struct ib_device_ops mlx5_ib_dev_dm_ops = {
@@ -6333,6 +4102,36 @@
 	.reg_dm_mr = mlx5_ib_reg_dm_mr,
 };
 
+static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_core_dev *mdev = dev->mdev;
+	struct mlx5_var_table *var_table = &dev->var_table;
+	u8 log_doorbell_bar_size;
+	u8 log_doorbell_stride;
+	u64 bar_size;
+
+	log_doorbell_bar_size = MLX5_CAP_DEV_VDPA_EMULATION(mdev,
+					log_doorbell_bar_size);
+	log_doorbell_stride = MLX5_CAP_DEV_VDPA_EMULATION(mdev,
+					log_doorbell_stride);
+	var_table->hw_start_addr = dev->mdev->bar_addr +
+				MLX5_CAP64_DEV_VDPA_EMULATION(mdev,
+					doorbell_bar_offset);
+	bar_size = (1ULL << log_doorbell_bar_size) * 4096;
+	var_table->stride_size = 1ULL << log_doorbell_stride;
+	var_table->num_var_hw_entries = div_u64(bar_size,
+						var_table->stride_size);
+	mutex_init(&var_table->bitmap_lock);
+	var_table->bitmap = bitmap_zalloc(var_table->num_var_hw_entries,
+					  GFP_KERNEL);
+	return (var_table->bitmap) ? 0 : -ENOMEM;
+}
+
+static void mlx5_ib_stage_caps_cleanup(struct mlx5_ib_dev *dev)
+{
+	bitmap_free(dev->var_table.bitmap);
+}
+
 static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_core_dev *mdev = dev->mdev;
@@ -6403,9 +4202,6 @@
 	    MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM)
 		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops);
 
-	if (mlx5_accel_ipsec_device_caps(dev->mdev) &
-	    MLX5_ACCEL_IPSEC_CAP_DEVICE)
-		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_flow_ipsec_ops);
 	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_ops);
 
 	if (IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS))
@@ -6420,6 +4216,13 @@
 	     MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
 		mutex_init(&dev->lb.mutex);
 
+	if (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
+			MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q) {
+		err = mlx5_ib_init_var_table(dev);
+		if (err)
+			return err;
+	}
+
 	dev->ib_dev.use_cq_dim = true;
 
 	return 0;
@@ -6441,7 +4244,7 @@
 	.query_port = mlx5_ib_rep_query_port,
 };
 
-static int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_raw_eth_non_default_cb(struct mlx5_ib_dev *dev)
 {
 	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_rep_ops);
 	return 0;
@@ -6454,67 +4257,41 @@
 	.destroy_wq = mlx5_ib_destroy_wq,
 	.get_netdev = mlx5_ib_get_netdev,
 	.modify_wq = mlx5_ib_modify_wq,
+
+	INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table,
+			   ib_rwq_ind_tbl),
 };
 
-static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev)
-{
-	u8 port_num;
-
-	dev->ib_dev.uverbs_ex_cmd_mask |=
-			(1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
-			(1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
-			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
-			(1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
-			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
-	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops);
-
-	port_num = mlx5_core_native_port_num(dev->mdev) - 1;
-
-	/* Register only for native ports */
-	return mlx5_add_netdev_notifier(dev, port_num);
-}
-
-static void mlx5_ib_stage_common_roce_cleanup(struct mlx5_ib_dev *dev)
-{
-	u8 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
-
-	mlx5_remove_netdev_notifier(dev, port_num);
-}
-
-static int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_core_dev *mdev = dev->mdev;
 	enum rdma_link_layer ll;
 	int port_type_cap;
-	int err = 0;
-
-	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
-	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
-
-	if (ll == IB_LINK_LAYER_ETHERNET)
-		err = mlx5_ib_stage_common_roce_init(dev);
-
-	return err;
-}
-
-static void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev)
-{
-	mlx5_ib_stage_common_roce_cleanup(dev);
-}
-
-static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev)
-{
-	struct mlx5_core_dev *mdev = dev->mdev;
-	enum rdma_link_layer ll;
-	int port_type_cap;
+	u8 port_num = 0;
 	int err;
 
 	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
 	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
 
 	if (ll == IB_LINK_LAYER_ETHERNET) {
-		err = mlx5_ib_stage_common_roce_init(dev);
-		if (err)
+		dev->ib_dev.uverbs_ex_cmd_mask |=
+			(1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
+			(1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
+			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
+			(1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
+			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
+		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_common_roce_ops);
+
+		port_num = mlx5_core_native_port_num(dev->mdev) - 1;
+
+		/* Register only for native ports */
+		err = mlx5_add_netdev_notifier(dev, port_num);
+		if (err || dev->is_rep || !mlx5_is_roce_enabled(mdev))
+			/*
+			 * We don't enable ETH interface for
+			 * 1. IB representors
+			 * 2. User disabled ROCE through devlink interface
+			 */
 			return err;
 
 		err = mlx5_enable_eth(dev);
@@ -6524,73 +4301,29 @@
 
 	return 0;
 cleanup:
-	mlx5_ib_stage_common_roce_cleanup(dev);
-
+	mlx5_remove_netdev_notifier(dev, port_num);
 	return err;
 }
 
-static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_roce_cleanup(struct mlx5_ib_dev *dev)
 {
 	struct mlx5_core_dev *mdev = dev->mdev;
 	enum rdma_link_layer ll;
 	int port_type_cap;
+	u8 port_num;
 
 	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
 	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
 
 	if (ll == IB_LINK_LAYER_ETHERNET) {
-		mlx5_disable_eth(dev);
-		mlx5_ib_stage_common_roce_cleanup(dev);
+		if (!dev->is_rep)
+			mlx5_disable_eth(dev);
+
+		port_num = mlx5_core_native_port_num(dev->mdev) - 1;
+		mlx5_remove_netdev_notifier(dev, port_num);
 	}
 }
 
-static int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev)
-{
-	return create_dev_resources(&dev->devr);
-}
-
-static void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev)
-{
-	destroy_dev_resources(&dev->devr);
-}
-
-static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
-{
-	return mlx5_ib_odp_init_one(dev);
-}
-
-static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
-{
-	mlx5_ib_odp_cleanup_one(dev);
-}
-
-static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = {
-	.alloc_hw_stats = mlx5_ib_alloc_hw_stats,
-	.get_hw_stats = mlx5_ib_get_hw_stats,
-	.counter_bind_qp = mlx5_ib_counter_bind_qp,
-	.counter_unbind_qp = mlx5_ib_counter_unbind_qp,
-	.counter_dealloc = mlx5_ib_counter_dealloc,
-	.counter_alloc_stats = mlx5_ib_counter_alloc_stats,
-	.counter_update_stats = mlx5_ib_counter_update_stats,
-};
-
-static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
-{
-	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
-		ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_hw_stats_ops);
-
-		return mlx5_ib_alloc_counters(dev);
-	}
-
-	return 0;
-}
-
-static void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev)
-{
-	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
-		mlx5_ib_dealloc_counters(dev);
-}
-
 static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev)
 {
 	mlx5_ib_init_cong_debugfs(dev,
@@ -6645,12 +4378,23 @@
 		name = "mlx5_%d";
 	else
 		name = "mlx5_bond_%d";
-	return ib_register_device(&dev->ib_dev, name);
+	return ib_register_device(&dev->ib_dev, name, &dev->mdev->pdev->dev);
 }
 
 static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
 {
-	destroy_umrc_res(dev);
+	int err;
+
+	err = mlx5_mr_cache_cleanup(dev);
+	if (err)
+		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
+
+	if (dev->umrc.qp)
+		mlx5_ib_destroy_qp(dev->umrc.qp, NULL);
+	if (dev->umrc.cq)
+		ib_free_cq(dev->umrc.cq);
+	if (dev->umrc.pd)
+		ib_dealloc_pd(dev->umrc.pd);
 }
 
 static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
@@ -6658,21 +4402,162 @@
 	ib_unregister_device(&dev->ib_dev);
 }
 
+enum {
+	MAX_UMR_WR = 128,
+};
+
 static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
 {
-	return create_umr_res(dev);
+	struct ib_qp_init_attr *init_attr = NULL;
+	struct ib_qp_attr *attr = NULL;
+	struct ib_pd *pd;
+	struct ib_cq *cq;
+	struct ib_qp *qp;
+	int ret;
+
+	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+	init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
+	if (!attr || !init_attr) {
+		ret = -ENOMEM;
+		goto error_0;
+	}
+
+	pd = ib_alloc_pd(&dev->ib_dev, 0);
+	if (IS_ERR(pd)) {
+		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
+		ret = PTR_ERR(pd);
+		goto error_0;
+	}
+
+	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
+	if (IS_ERR(cq)) {
+		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
+		ret = PTR_ERR(cq);
+		goto error_2;
+	}
+
+	init_attr->send_cq = cq;
+	init_attr->recv_cq = cq;
+	init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+	init_attr->cap.max_send_wr = MAX_UMR_WR;
+	init_attr->cap.max_send_sge = 1;
+	init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
+	init_attr->port_num = 1;
+	qp = mlx5_ib_create_qp(pd, init_attr, NULL);
+	if (IS_ERR(qp)) {
+		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
+		ret = PTR_ERR(qp);
+		goto error_3;
+	}
+	qp->device     = &dev->ib_dev;
+	qp->real_qp    = qp;
+	qp->uobject    = NULL;
+	qp->qp_type    = MLX5_IB_QPT_REG_UMR;
+	qp->send_cq    = init_attr->send_cq;
+	qp->recv_cq    = init_attr->recv_cq;
+
+	attr->qp_state = IB_QPS_INIT;
+	attr->port_num = 1;
+	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
+				IB_QP_PORT, NULL);
+	if (ret) {
+		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
+		goto error_4;
+	}
+
+	memset(attr, 0, sizeof(*attr));
+	attr->qp_state = IB_QPS_RTR;
+	attr->path_mtu = IB_MTU_256;
+
+	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
+	if (ret) {
+		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
+		goto error_4;
+	}
+
+	memset(attr, 0, sizeof(*attr));
+	attr->qp_state = IB_QPS_RTS;
+	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
+	if (ret) {
+		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
+		goto error_4;
+	}
+
+	dev->umrc.qp = qp;
+	dev->umrc.cq = cq;
+	dev->umrc.pd = pd;
+
+	sema_init(&dev->umrc.sem, MAX_UMR_WR);
+	ret = mlx5_mr_cache_init(dev);
+	if (ret) {
+		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
+		goto error_4;
+	}
+
+	kfree(attr);
+	kfree(init_attr);
+
+	return 0;
+
+error_4:
+	mlx5_ib_destroy_qp(qp, NULL);
+	dev->umrc.qp = NULL;
+
+error_3:
+	ib_free_cq(cq);
+	dev->umrc.cq = NULL;
+
+error_2:
+	ib_dealloc_pd(pd);
+	dev->umrc.pd = NULL;
+
+error_0:
+	kfree(attr);
+	kfree(init_attr);
+	return ret;
 }
 
 static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev)
 {
-	init_delay_drop(dev);
+	struct dentry *root;
 
+	if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
+		return 0;
+
+	mutex_init(&dev->delay_drop.lock);
+	dev->delay_drop.dev = dev;
+	dev->delay_drop.activate = false;
+	dev->delay_drop.timeout = MLX5_MAX_DELAY_DROP_TIMEOUT_MS * 1000;
+	INIT_WORK(&dev->delay_drop.delay_drop_work, delay_drop_handler);
+	atomic_set(&dev->delay_drop.rqs_cnt, 0);
+	atomic_set(&dev->delay_drop.events_cnt, 0);
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	root = debugfs_create_dir("delay_drop", dev->mdev->priv.dbg_root);
+	dev->delay_drop.dir_debugfs = root;
+
+	debugfs_create_atomic_t("num_timeout_events", 0400, root,
+				&dev->delay_drop.events_cnt);
+	debugfs_create_atomic_t("num_rqs", 0400, root,
+				&dev->delay_drop.rqs_cnt);
+	debugfs_create_file("timeout", 0600, root, &dev->delay_drop,
+			    &fops_delay_drop_timeout);
 	return 0;
 }
 
 static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
 {
-	cancel_delay_drop(dev);
+	if (!(dev->ib_dev.attrs.raw_packet_caps & IB_RAW_PACKET_CAP_DELAY_DROP))
+		return;
+
+	cancel_work_sync(&dev->delay_drop.delay_drop_work);
+	if (!dev->delay_drop.dir_debugfs)
+		return;
+
+	debugfs_remove_recursive(dev->delay_drop.dir_debugfs);
+	dev->delay_drop.dir_debugfs = NULL;
 }
 
 static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
@@ -6687,30 +4572,12 @@
 	mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
 }
 
-static int mlx5_ib_stage_devx_init(struct mlx5_ib_dev *dev)
-{
-	int uid;
-
-	uid = mlx5_ib_devx_create(dev, false);
-	if (uid > 0) {
-		dev->devx_whitelist_uid = uid;
-		mlx5_ib_devx_init_event_table(dev);
-	}
-
-	return 0;
-}
-static void mlx5_ib_stage_devx_cleanup(struct mlx5_ib_dev *dev)
-{
-	if (dev->devx_whitelist_uid) {
-		mlx5_ib_devx_cleanup_event_table(dev);
-		mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid);
-	}
-}
-
 void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
 		      const struct mlx5_ib_profile *profile,
 		      int stage)
 {
+	dev->ib_active = false;
+
 	/* Number of stages to cleanup */
 	while (stage) {
 		stage--;
@@ -6728,6 +4595,8 @@
 	int err;
 	int i;
 
+	dev->profile = profile;
+
 	for (i = 0; i < MLX5_IB_STAGE_MAX; i++) {
 		if (profile->stage[i].init) {
 			err = profile->stage[i].init(dev);
@@ -6736,7 +4605,6 @@
 		}
 	}
 
-	dev->profile = profile;
 	dev->ib_active = true;
 
 	return dev;
@@ -6751,33 +4619,36 @@
 	STAGE_CREATE(MLX5_IB_STAGE_INIT,
 		     mlx5_ib_stage_init_init,
 		     mlx5_ib_stage_init_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
-		     mlx5_ib_stage_flow_db_init,
-		     mlx5_ib_stage_flow_db_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_FS,
+		     mlx5_ib_fs_init,
+		     mlx5_ib_fs_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_CAPS,
 		     mlx5_ib_stage_caps_init,
-		     NULL),
+		     mlx5_ib_stage_caps_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
 		     mlx5_ib_stage_non_default_cb,
 		     NULL),
 	STAGE_CREATE(MLX5_IB_STAGE_ROCE,
-		     mlx5_ib_stage_roce_init,
-		     mlx5_ib_stage_roce_cleanup),
+		     mlx5_ib_roce_init,
+		     mlx5_ib_roce_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_QP,
+		     mlx5_init_qp_table,
+		     mlx5_cleanup_qp_table),
 	STAGE_CREATE(MLX5_IB_STAGE_SRQ,
 		     mlx5_init_srq_table,
 		     mlx5_cleanup_srq_table),
 	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
-		     mlx5_ib_stage_dev_res_init,
-		     mlx5_ib_stage_dev_res_cleanup),
+		     mlx5_ib_dev_res_init,
+		     mlx5_ib_dev_res_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
 		     mlx5_ib_stage_dev_notifier_init,
 		     mlx5_ib_stage_dev_notifier_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_ODP,
-		     mlx5_ib_stage_odp_init,
-		     mlx5_ib_stage_odp_cleanup),
+		     mlx5_ib_odp_init_one,
+		     mlx5_ib_odp_cleanup_one),
 	STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
-		     mlx5_ib_stage_counters_init,
-		     mlx5_ib_stage_counters_cleanup),
+		     mlx5_ib_counters_init,
+		     mlx5_ib_counters_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
 		     mlx5_ib_stage_cong_debugfs_init,
 		     mlx5_ib_stage_cong_debugfs_cleanup),
@@ -6791,8 +4662,8 @@
 		     NULL,
 		     mlx5_ib_stage_pre_ib_reg_umr_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
-		     mlx5_ib_stage_devx_init,
-		     mlx5_ib_stage_devx_cleanup),
+		     mlx5_ib_devx_init,
+		     mlx5_ib_devx_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
 		     mlx5_ib_stage_ib_reg_init,
 		     mlx5_ib_stage_ib_reg_cleanup),
@@ -6802,36 +4673,45 @@
 	STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP,
 		     mlx5_ib_stage_delay_drop_init,
 		     mlx5_ib_stage_delay_drop_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_RESTRACK,
+		     mlx5_ib_restrack_init,
+		     NULL),
 };
 
-const struct mlx5_ib_profile uplink_rep_profile = {
+const struct mlx5_ib_profile raw_eth_profile = {
 	STAGE_CREATE(MLX5_IB_STAGE_INIT,
 		     mlx5_ib_stage_init_init,
 		     mlx5_ib_stage_init_cleanup),
-	STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
-		     mlx5_ib_stage_flow_db_init,
-		     mlx5_ib_stage_flow_db_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_FS,
+		     mlx5_ib_fs_init,
+		     mlx5_ib_fs_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_CAPS,
 		     mlx5_ib_stage_caps_init,
-		     NULL),
+		     mlx5_ib_stage_caps_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
-		     mlx5_ib_stage_rep_non_default_cb,
+		     mlx5_ib_stage_raw_eth_non_default_cb,
 		     NULL),
 	STAGE_CREATE(MLX5_IB_STAGE_ROCE,
-		     mlx5_ib_stage_rep_roce_init,
-		     mlx5_ib_stage_rep_roce_cleanup),
+		     mlx5_ib_roce_init,
+		     mlx5_ib_roce_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_QP,
+		     mlx5_init_qp_table,
+		     mlx5_cleanup_qp_table),
 	STAGE_CREATE(MLX5_IB_STAGE_SRQ,
 		     mlx5_init_srq_table,
 		     mlx5_cleanup_srq_table),
 	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
-		     mlx5_ib_stage_dev_res_init,
-		     mlx5_ib_stage_dev_res_cleanup),
+		     mlx5_ib_dev_res_init,
+		     mlx5_ib_dev_res_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
 		     mlx5_ib_stage_dev_notifier_init,
 		     mlx5_ib_stage_dev_notifier_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
-		     mlx5_ib_stage_counters_init,
-		     mlx5_ib_stage_counters_cleanup),
+		     mlx5_ib_counters_init,
+		     mlx5_ib_counters_cleanup),
+	STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS,
+		     mlx5_ib_stage_cong_debugfs_init,
+		     mlx5_ib_stage_cong_debugfs_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_UAR,
 		     mlx5_ib_stage_uar_init,
 		     mlx5_ib_stage_uar_cleanup),
@@ -6842,14 +4722,17 @@
 		     NULL,
 		     mlx5_ib_stage_pre_ib_reg_umr_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
-		     mlx5_ib_stage_devx_init,
-		     mlx5_ib_stage_devx_cleanup),
+		     mlx5_ib_devx_init,
+		     mlx5_ib_devx_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
 		     mlx5_ib_stage_ib_reg_init,
 		     mlx5_ib_stage_ib_reg_cleanup),
 	STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
 		     mlx5_ib_stage_post_ib_reg_umr_init,
 		     NULL),
+	STAGE_CREATE(MLX5_IB_STAGE_RESTRACK,
+		     mlx5_ib_restrack_init,
+		     NULL),
 };
 
 static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev)
@@ -6896,13 +4779,12 @@
 
 static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
 {
+	const struct mlx5_ib_profile *profile;
 	enum rdma_link_layer ll;
 	struct mlx5_ib_dev *dev;
 	int port_type_cap;
 	int num_ports;
 
-	printk_once(KERN_INFO "%s", mlx5_version);
-
 	if (MLX5_ESWITCH_MANAGER(mdev) &&
 	    mlx5_ib_eswitch_mode(mdev->priv.eswitch) == MLX5_ESWITCH_OFFLOADS) {
 		if (!mlx5_core_mp_enabled(mdev))
@@ -6931,7 +4813,12 @@
 	dev->mdev = mdev;
 	dev->num_ports = num_ports;
 
-	return __mlx5_ib_add(dev, &pf_profile);
+	if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_is_roce_enabled(mdev))
+		profile = &raw_eth_profile;
+	else
+		profile = &pf_profile;
+
+	return __mlx5_ib_add(dev, profile);
 }
 
 static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
index b5aece7..13de3d2 100644
--- a/drivers/infiniband/hw/mlx5/mem.c
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -34,6 +34,7 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_umem_odp.h>
 #include "mlx5_ib.h"
+#include <linux/jiffies.h>
 
 /* @umem: umem object to scan
  * @addr: ib virtual address requested by the user
@@ -100,18 +101,6 @@
 	*count = i;
 }
 
-static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
-{
-	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
-
-	if (umem_dma & ODP_READ_ALLOWED_BIT)
-		mtt_entry |= MLX5_IB_MTT_READ;
-	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
-		mtt_entry |= MLX5_IB_MTT_WRITE;
-
-	return mtt_entry;
-}
-
 /*
  * Populate the given array with bus addresses from the umem.
  *
@@ -138,19 +127,6 @@
 	struct scatterlist *sg;
 	int entry;
 
-	if (umem->is_odp) {
-		WARN_ON(shift != 0);
-		WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE));
-
-		for (i = 0; i < num_pages; ++i) {
-			dma_addr_t pa =
-				to_ib_umem_odp(umem)->dma_list[offset + i];
-
-			pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
-		}
-		return;
-	}
-
 	i = 0;
 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
 		len = sg_dma_len(sg) >> PAGE_SHIFT;
@@ -193,8 +169,8 @@
 			  int page_shift, __be64 *pas, int access_flags)
 {
 	return __mlx5_ib_populate_pas(dev, umem, page_shift, 0,
-				      ib_umem_num_pages(umem), pas,
-				      access_flags);
+				      ib_umem_num_dma_blocks(umem, PAGE_SIZE),
+				      pas, access_flags);
 }
 int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset)
 {
@@ -216,3 +192,201 @@
 	*offset = buf_off >> ilog2(off_size);
 	return 0;
 }
+
+#define WR_ID_BF 0xBF
+#define WR_ID_END 0xBAD
+#define TEST_WC_NUM_WQES 255
+#define TEST_WC_POLLING_MAX_TIME_JIFFIES msecs_to_jiffies(100)
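+/*
+ * Post a single NOP WQE and ring the doorbell by copying the WQE into the
+ * QP's BlueFlame register.
+ */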
+static int post_send_nop(struct mlx5_ib_dev *dev, struct ib_qp *ibqp, u64 wr_id,
+			 bool signaled)
+{
+	struct mlx5_ib_qp *qp = to_mqp(ibqp);
+	struct mlx5_wqe_ctrl_seg *ctrl;
+	struct mlx5_bf *bf = &qp->bf;
+	__be32 mmio_wqe[16] = {};
+	unsigned long flags;
+	unsigned int idx;
+	int i;
+
+	if (unlikely(dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR))
+		return -EIO;
+
+	spin_lock_irqsave(&qp->sq.lock, flags);
+
+	idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
+	ctrl = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx);
+
+	memset(ctrl, 0, sizeof(struct mlx5_wqe_ctrl_seg));
+	ctrl->fm_ce_se = signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0;
+	ctrl->opmod_idx_opcode =
+		cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | MLX5_OPCODE_NOP);
+	ctrl->qpn_ds = cpu_to_be32((sizeof(struct mlx5_wqe_ctrl_seg) / 16) |
+				   (qp->trans_qp.base.mqp.qpn << 8));
+
+	qp->sq.wrid[idx] = wr_id;
+	qp->sq.w_list[idx].opcode = MLX5_OPCODE_NOP;
+	qp->sq.wqe_head[idx] = qp->sq.head + 1;
+	qp->sq.cur_post += DIV_ROUND_UP(sizeof(struct mlx5_wqe_ctrl_seg),
+					MLX5_SEND_WQE_BB);
+	qp->sq.w_list[idx].next = qp->sq.cur_post;
+	qp->sq.head++;
+
+	memcpy(mmio_wqe, ctrl, sizeof(*ctrl));
+	((struct mlx5_wqe_ctrl_seg *)&mmio_wqe)->fm_ce_se |=
+		MLX5_WQE_CTRL_CQ_UPDATE;
+
+	/* Make sure that descriptors are written before
+	 * updating doorbell record and ringing the doorbell
+	 */
+	wmb();
+
+	qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post);
+
+	/* Make sure doorbell record is visible to the HCA before
+	 * we hit doorbell
+	 */
+	wmb();
+	for (i = 0; i < 8; i++)
+		mlx5_write64(&mmio_wqe[i * 2],
+			     bf->bfreg->map + bf->offset + i * 8);
+
+	bf->offset ^= bf->buf_size;
+
+	spin_unlock_irqrestore(&qp->sq.lock, flags);
+
+	return 0;
+}
+
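+/*
+ * Poll for the first completion for up to 100ms. Returns a positive value if
+ * the completed wr_id is WR_ID_BF, 0 if some other wr_id completed, and a
+ * negative errno on error or timeout.
+ */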
+static int test_wc_poll_cq_result(struct mlx5_ib_dev *dev, struct ib_cq *cq)
+{
+	int ret;
+	struct ib_wc wc = {};
+	unsigned long end = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES;
+
+	do {
+		ret = ib_poll_cq(cq, 1, &wc);
+		if (ret < 0 || wc.status)
+			return ret < 0 ? ret : -EINVAL;
+		if (ret)
+			break;
+	} while (!time_after(jiffies, end));
+
+	if (!ret)
+		return -ETIMEDOUT;
+
+	if (wc.wr_id != WR_ID_BF)
+		ret = 0;
+
+	return ret;
+}
+
+static int test_wc_do_send(struct mlx5_ib_dev *dev, struct ib_qp *qp)
+{
+	int err, i;
+
+	for (i = 0; i < TEST_WC_NUM_WQES; i++) {
+		err = post_send_nop(dev, qp, WR_ID_BF, false);
+		if (err)
+			return err;
+	}
+
+	return post_send_nop(dev, qp, WR_ID_END, true);
+}
+
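+/*
+ * Probe write-combining support: allocate a WC-mapped bfreg, post NOP WQEs
+ * from a QP created with MLX5_IB_QP_CREATE_WC_TEST and check which wr_id
+ * completes first. CQ_UPDATE is set only in the MMIO copy of the unsignaled
+ * WQEs, so a WR_ID_BF completion indicates that the write-combined doorbell
+ * data actually reached the device.
+ */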
+int mlx5_ib_test_wc(struct mlx5_ib_dev *dev)
+{
+	struct ib_cq_init_attr cq_attr = { .cqe = TEST_WC_NUM_WQES + 1 };
+	int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);
+	struct ib_qp_init_attr qp_init_attr = {
+		.cap = { .max_send_wr = TEST_WC_NUM_WQES },
+		.qp_type = IB_QPT_UD,
+		.sq_sig_type = IB_SIGNAL_REQ_WR,
+		.create_flags = MLX5_IB_QP_CREATE_WC_TEST,
+	};
+	struct ib_qp_attr qp_attr = { .port_num = 1 };
+	struct ib_device *ibdev = &dev->ib_dev;
+	struct ib_qp *qp;
+	struct ib_cq *cq;
+	struct ib_pd *pd;
+	int ret;
+
+	if (!MLX5_CAP_GEN(dev->mdev, bf))
+		return 0;
+
+	if (!dev->mdev->roce.roce_en &&
+	    port_type_cap == MLX5_CAP_PORT_TYPE_ETH) {
+		if (mlx5_core_is_pf(dev->mdev))
+			dev->wc_support = arch_can_pci_mmap_wc();
+		return 0;
+	}
+
+	ret = mlx5_alloc_bfreg(dev->mdev, &dev->wc_bfreg, true, false);
+	if (ret)
+		goto print_err;
+
+	if (!dev->wc_bfreg.wc)
+		goto out1;
+
+	pd = ib_alloc_pd(ibdev, 0);
+	if (IS_ERR(pd)) {
+		ret = PTR_ERR(pd);
+		goto out1;
+	}
+
+	cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr);
+	if (IS_ERR(cq)) {
+		ret = PTR_ERR(cq);
+		goto out2;
+	}
+
+	qp_init_attr.recv_cq = cq;
+	qp_init_attr.send_cq = cq;
+	qp = ib_create_qp(pd, &qp_init_attr);
+	if (IS_ERR(qp)) {
+		ret = PTR_ERR(qp);
+		goto out3;
+	}
+
+	qp_attr.qp_state = IB_QPS_INIT;
+	ret = ib_modify_qp(qp, &qp_attr,
+			   IB_QP_STATE | IB_QP_PORT | IB_QP_PKEY_INDEX |
+				   IB_QP_QKEY);
+	if (ret)
+		goto out4;
+
+	qp_attr.qp_state = IB_QPS_RTR;
+	ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+	if (ret)
+		goto out4;
+
+	qp_attr.qp_state = IB_QPS_RTS;
+	ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);
+	if (ret)
+		goto out4;
+
+	ret = test_wc_do_send(dev, qp);
+	if (ret < 0)
+		goto out4;
+
+	ret = test_wc_poll_cq_result(dev, cq);
+	if (ret > 0) {
+		dev->wc_support = true;
+		ret = 0;
+	}
+
+out4:
+	ib_destroy_qp(qp);
+out3:
+	ib_destroy_cq(cq);
+out2:
+	ib_dealloc_pd(pd);
+out1:
+	mlx5_free_bfreg(dev->mdev, &dev->wc_bfreg);
+print_err:
+	if (ret)
+		mlx5_ib_err(
+			dev,
+			"Error %d while trying to test write-combining support\n",
+			ret);
+	return ret;
+}
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index a9ce46c..b1f2b34 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -1,33 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
 /*
- * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses.  You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- *     Redistribution and use in source and binary forms, with or
- *     without modification, are permitted provided that the following
- *     conditions are met:
- *
- *      - Redistributions of source code must retain the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer.
- *
- *      - Redistributions in binary form must reproduce the above
- *        copyright notice, this list of conditions and the following
- *        disclaimer in the documentation and/or other materials
- *        provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
  */
 
 #ifndef MLX5_IB_H
@@ -64,14 +37,17 @@
 	dev_warn(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__,     \
 		 __LINE__, current->pid, ##arg)
 
-#define field_avail(type, fld, sz) (offsetof(type, fld) +		\
-				    sizeof(((type *)0)->fld) <= (sz))
 #define MLX5_IB_DEFAULT_UIDX 0xffffff
 #define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index)
 
 #define MLX5_MKEY_PAGE_SHIFT_MASK __mlx5_mask(mkc, log_page_size)
 
 enum {
+	MLX5_IB_MMAP_OFFSET_START = 9,
+	MLX5_IB_MMAP_OFFSET_END = 255,
+};
+
+enum {
 	MLX5_IB_MMAP_CMD_SHIFT	= 8,
 	MLX5_IB_MMAP_CMD_MASK	= 0xff,
 };
@@ -118,9 +94,30 @@
 	MLX5_MEMIC_BASE_SIZE	= 1 << MLX5_MEMIC_BASE_ALIGN,
 };
 
-#define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)                                        \
-	(MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity))
-#define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev))
+enum mlx5_ib_mmap_type {
+	MLX5_IB_MMAP_TYPE_MEMIC = 1,
+	MLX5_IB_MMAP_TYPE_VAR = 2,
+	MLX5_IB_MMAP_TYPE_UAR_WC = 3,
+	MLX5_IB_MMAP_TYPE_UAR_NC = 4,
+};
+
+struct mlx5_bfreg_info {
+	u32 *sys_pages;
+	int num_low_latency_bfregs;
+	unsigned int *count;
+
+	/*
+	 * protect bfreg allocation data structs
+	 */
+	struct mutex lock;
+	u32 ver;
+	u8 lib_uar_4k : 1;
+	u8 lib_uar_dyn : 1;
+	u32 num_sys_pages;
+	u32 num_static_sys_pages;
+	u32 total_num_bfregs;
+	u32 num_dyn_bfregs;
+};
 
 struct mlx5_ib_ucontext {
 	struct ib_ucontext	ibucontext;
@@ -135,7 +132,6 @@
 	u32			tdn;
 
 	u64			lib_caps;
-	DECLARE_BITMAP(dm_pages, MLX5_MAX_MEMIC_PAGES);
 	u16			devx_uid;
 	/* For RoCE LAG TX affinity */
 	atomic_t		tx_port_affinity;
@@ -194,6 +190,11 @@
 	u8			match_criteria_enable;
 };
 
+struct mlx5_ib_pp {
+	u16 index;
+	struct mlx5_core_dev *mdev;
+};
+
 struct mlx5_ib_flow_db {
 	struct mlx5_ib_flow_prio	prios[MLX5_IB_NUM_FLOW_FT];
 	struct mlx5_ib_flow_prio	egress_prios[MLX5_IB_NUM_FLOW_FT];
@@ -201,6 +202,7 @@
 	struct mlx5_ib_flow_prio	egress[MLX5_IB_NUM_EGRESS_FTS];
 	struct mlx5_ib_flow_prio	fdb;
 	struct mlx5_ib_flow_prio	rdma_rx[MLX5_IB_NUM_FLOW_FT];
+	struct mlx5_ib_flow_prio	rdma_tx[MLX5_IB_NUM_FLOW_FT];
 	struct mlx5_flow_table		*lag_demux_ft;
 	/* Protect flow steering bypass flow tables
 	 * when add/del flow rules.
@@ -247,12 +249,8 @@
  * These flags are intended for internal use by the mlx5_ib driver, and they
  * rely on the range reserved for that use in the ib_qp_create_flags enum.
  */
-
-/* Create a UD QP whose source QP number is 1 */
-static inline enum ib_qp_create_flags mlx5_ib_create_qp_sqpn_qp1(void)
-{
-	return IB_QP_CREATE_RESERVED_START;
-}
+#define MLX5_IB_QP_CREATE_SQPN_QP1	IB_QP_CREATE_RESERVED_START
+#define MLX5_IB_QP_CREATE_WC_TEST	(IB_QP_CREATE_RESERVED_START << 1)
 
 struct wr_list {
 	u16	opcode;
@@ -296,6 +294,7 @@
 #define MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES 16
 #define MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES 6
 #define MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES 13
+#define MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES 3
 
 struct mlx5_ib_rwq {
 	struct ib_wq		ibwq;
@@ -311,7 +310,6 @@
 	struct ib_umem		*umem;
 	size_t			buf_size;
 	unsigned int		page_shift;
-	int			create_type;
 	struct mlx5_db		db;
 	u32			user_index;
 	u32			wqe_count;
@@ -320,17 +318,6 @@
 	u32			create_flags; /* Use enum mlx5_ib_wq_flags */
 };
 
-enum {
-	MLX5_QP_USER,
-	MLX5_QP_KERNEL,
-	MLX5_QP_EMPTY
-};
-
-enum {
-	MLX5_WQ_USER,
-	MLX5_WQ_KERNEL
-};
-
 struct mlx5_ib_rwq_ind_table {
 	struct ib_rwq_ind_table ib_rwq_ind_tbl;
 	u32			rqtn;
@@ -397,6 +384,22 @@
 	u32                     *in;
 };
 
+struct mlx5_ib_gsi_qp {
+	struct ib_qp *rx_qp;
+	u8 port_num;
+	struct ib_qp_cap cap;
+	struct ib_cq *cq;
+	struct mlx5_ib_gsi_wr *outstanding_wrs;
+	u32 outstanding_pi, outstanding_ci;
+	int num_qps;
+	/* Protects access to the tx_qps. Post send operations synchronize
+	 * with tx_qp creation in setup_qp(). Also protects the
+	 * outstanding_wrs array and indices.
+	 */
+	spinlock_t lock;
+	struct ib_qp **tx_qps;
+};
+
 struct mlx5_ib_qp {
 	struct ib_qp		ibqp;
 	union {
@@ -404,6 +407,7 @@
 		struct mlx5_ib_raw_packet_qp raw_packet_qp;
 		struct mlx5_ib_rss_qp rss_qp;
 		struct mlx5_ib_dct dct;
+		struct mlx5_ib_gsi_qp gsi;
 	};
 	struct mlx5_frag_buf	buf;
 
@@ -417,34 +421,37 @@
 	/* serialize qp state modifications
 	 */
 	struct mutex		mutex;
+	/* cached variant of create_flags from struct ib_qp_init_attr */
 	u32			flags;
 	u8			port;
 	u8			state;
-	int			wq_sig;
-	int			scat_cqe;
 	int			max_inline_data;
 	struct mlx5_bf	        bf;
-	int			has_rq;
+	u8			has_rq:1;
+	u8			is_rss:1;
 
 	/* only for user space QPs. For kernel
 	 * we have it from the bf object
 	 */
 	int			bfregn;
 
-	int			create_type;
-
 	struct list_head	qps_list;
 	struct list_head	cq_recv_list;
 	struct list_head	cq_send_list;
 	struct mlx5_rate_limit	rl;
 	u32                     underlay_qpn;
 	u32			flags_en;
-	/* storage for qp sub type when core qp type is IB_QPT_DRIVER */
-	enum ib_qp_type		qp_sub_type;
+	/*
+	 * IB/core doesn't store low-level QP types, so
+	 * store both MLX and IBTA types in the field below.
+	 * IB_QPT_DRIVER will be broken down into DCI/DCT subtypes.
+	 */
+	enum ib_qp_type		type;
 	/* A flag to indicate that a new counter has been configured
 	 * but has not yet taken effect
 	 */
 	u32                     counter_pending;
+	u16			gsi_lag_port;
 };
 
 struct mlx5_ib_cq_buf {
@@ -455,24 +462,6 @@
 	int			nent;
 };
 
-enum mlx5_ib_qp_flags {
-	MLX5_IB_QP_LSO                          = IB_QP_CREATE_IPOIB_UD_LSO,
-	MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK     = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
-	MLX5_IB_QP_CROSS_CHANNEL            = IB_QP_CREATE_CROSS_CHANNEL,
-	MLX5_IB_QP_MANAGED_SEND             = IB_QP_CREATE_MANAGED_SEND,
-	MLX5_IB_QP_MANAGED_RECV             = IB_QP_CREATE_MANAGED_RECV,
-	MLX5_IB_QP_SIGNATURE_HANDLING           = 1 << 5,
-	/* QP uses 1 as its source QP number */
-	MLX5_IB_QP_SQPN_QP1			= 1 << 6,
-	MLX5_IB_QP_CAP_SCATTER_FCS		= 1 << 7,
-	MLX5_IB_QP_RSS				= 1 << 8,
-	MLX5_IB_QP_CVLAN_STRIPPING		= 1 << 9,
-	MLX5_IB_QP_UNDERLAY			= 1 << 10,
-	MLX5_IB_QP_PCI_WRITE_END_PADDING	= 1 << 11,
-	MLX5_IB_QP_TUNNEL_OFFLOAD		= 1 << 12,
-	MLX5_IB_QP_PACKET_BASED_CREDIT		= 1 << 13,
-};
-
 struct mlx5_umr_wr {
 	struct ib_send_wr		wr;
 	u64				virt_addr;
@@ -560,6 +549,13 @@
 	MLX5_IB_MTT_WRITE = (1 << 1),
 };
 
+struct mlx5_user_mmap_entry {
+	struct rdma_user_mmap_entry rdma_entry;
+	u8 mmap_flag;
+	u64 address;
+	u32 page_idx;
+};
+
 struct mlx5_ib_dm {
 	struct ib_dm		ibdm;
 	phys_addr_t		dev_addr;
@@ -571,6 +567,7 @@
 		} icm_dm;
 		/* other dm types specific params should be added here */
 	};
+	struct mlx5_user_mmap_entry mentry;
 };
 
 #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
@@ -586,6 +583,9 @@
 					  IB_ACCESS_REMOTE_READ   |\
 					  IB_ZERO_BASED)
 
+#define mlx5_update_odp_stats(mr, counter_name, value)		\
+	atomic64_add(value, &((mr)->odp_stats.counter_name))
+
 struct mlx5_ib_mr {
 	struct ib_mr		ibmr;
 	void			*descs;
@@ -601,13 +601,12 @@
 	struct ib_umem	       *umem;
 	struct mlx5_shared_mr_info	*smr_info;
 	struct list_head	list;
-	int			order;
-	bool			allocated_from_cache;
+	unsigned int		order;
+	struct mlx5_cache_ent  *cache_ent;
 	int			npages;
 	struct mlx5_ib_dev     *dev;
 	u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
 	struct mlx5_core_sig_ctx    *sig;
-	unsigned int		live;
 	void			*descs_alloc;
 	int			access_flags; /* Needed for rereg MR */
 
@@ -619,10 +618,19 @@
 	u64			data_iova;
 	u64			pi_iova;
 
-	atomic_t		num_leaf_free;
-	wait_queue_head_t       q_leaf_free;
+	/* For ODP and implicit */
+	atomic_t		num_deferred_work;
+	wait_queue_head_t       q_deferred_work;
+	struct xarray		implicit_children;
+	union {
+		struct rcu_head rcu;
+		struct list_head elm;
+		struct work_struct work;
+	} odp_destroy;
+	struct ib_odp_counters	odp_stats;
+	bool			is_odp_implicit;
+
 	struct mlx5_async_work  cb_work;
-	atomic_t		num_pending_prefetch;
 };
 
 static inline bool is_odp_mr(struct mlx5_ib_mr *mr)
@@ -657,12 +665,6 @@
 	struct semaphore	sem;
 };
 
-enum {
-	MLX5_FMR_INVALID,
-	MLX5_FMR_VALID,
-	MLX5_FMR_BUSY,
-};
-
 struct mlx5_cache_ent {
 	struct list_head	head;
 	/* sync access to the cache entry
@@ -676,38 +678,47 @@
 	u32			access_mode;
 	u32			page;
 
-	u32			size;
-	u32                     cur;
+	u8 disabled:1;
+	u8 fill_to_high_water:1;
+
+	/*
+	 * - available_mrs is the length of the list head, i.e. the number of
+	 *   MRs available for immediate allocation.
+	 * - total_mrs is available_mrs plus all in-use MRs that could be
+	 *   returned to the cache.
+	 * - limit is the low water mark for available_mrs, 2 * limit is the
+	 *   upper water mark.
+	 * - pending is the number of MRs currently being created.
+	 */
+	u32 total_mrs;
+	u32 available_mrs;
+	u32 limit;
+	u32 pending;
+
+	/* Statistics */
 	u32                     miss;
-	u32			limit;
 
 	struct mlx5_ib_dev     *dev;
 	struct work_struct	work;
 	struct delayed_work	dwork;
-	int			pending;
-	struct completion	compl;
 };
 
 struct mlx5_mr_cache {
 	struct workqueue_struct *wq;
 	struct mlx5_cache_ent	ent[MAX_MR_CACHE_ENTRIES];
-	int			stopped;
 	struct dentry		*root;
 	unsigned long		last_add;
 };
 
-struct mlx5_ib_gsi_qp;
-
 struct mlx5_ib_port_resources {
-	struct mlx5_ib_resources *devr;
 	struct mlx5_ib_gsi_qp *gsi;
 	struct work_struct pkey_change_work;
 };
 
 struct mlx5_ib_resources {
 	struct ib_cq	*c0;
-	struct ib_xrcd	*x0;
-	struct ib_xrcd	*x1;
+	u32 xrcdn0;
+	u32 xrcdn1;
 	struct ib_pd	*p0;
 	struct ib_srq	*s0;
 	struct ib_srq	*s1;
@@ -723,7 +734,6 @@
 	u32 num_cong_counters;
 	u32 num_ext_ppcnt_counters;
 	u16 set_id;
-	bool set_id_valid;
 };
 
 struct mlx5_ib_multiport_info;
@@ -769,6 +779,7 @@
 	MLX5_IB_DBG_CC_RP_BYTE_RESET,
 	MLX5_IB_DBG_CC_RP_THRESHOLD,
 	MLX5_IB_DBG_CC_RP_AI_RATE,
+	MLX5_IB_DBG_CC_RP_MAX_RATE,
 	MLX5_IB_DBG_CC_RP_HAI_RATE,
 	MLX5_IB_DBG_CC_RP_MIN_DEC_FAC,
 	MLX5_IB_DBG_CC_RP_MIN_RATE,
@@ -778,6 +789,7 @@
 	MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD,
 	MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE,
 	MLX5_IB_DBG_CC_RP_GD,
+	MLX5_IB_DBG_CC_NP_MIN_TIME_BETWEEN_CNPS,
 	MLX5_IB_DBG_CC_NP_CNP_DSCP,
 	MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE,
 	MLX5_IB_DBG_CC_NP_CNP_PRIO,
@@ -793,13 +805,6 @@
 	MLX5_MAX_DELAY_DROP_TIMEOUT_MS = 100,
 };
 
-struct mlx5_ib_dbg_delay_drop {
-	struct dentry		*dir_debugfs;
-	struct dentry		*rqs_cnt_debugfs;
-	struct dentry		*events_cnt_debugfs;
-	struct dentry		*timeout_debugfs;
-};
-
 struct mlx5_ib_delay_drop {
 	struct mlx5_ib_dev     *dev;
 	struct work_struct	delay_drop_work;
@@ -809,15 +814,16 @@
 	bool			activate;
 	atomic_t		events_cnt;
 	atomic_t		rqs_cnt;
-	struct mlx5_ib_dbg_delay_drop *dbg;
+	struct dentry		*dir_debugfs;
 };
 
 enum mlx5_ib_stages {
 	MLX5_IB_STAGE_INIT,
-	MLX5_IB_STAGE_FLOW_DB,
+	MLX5_IB_STAGE_FS,
 	MLX5_IB_STAGE_CAPS,
 	MLX5_IB_STAGE_NON_DEFAULT_CB,
 	MLX5_IB_STAGE_ROCE,
+	MLX5_IB_STAGE_QP,
 	MLX5_IB_STAGE_SRQ,
 	MLX5_IB_STAGE_DEVICE_RESOURCES,
 	MLX5_IB_STAGE_DEVICE_NOTIFIER,
@@ -831,7 +837,7 @@
 	MLX5_IB_STAGE_IB_REG,
 	MLX5_IB_STAGE_POST_IB_REG_UMR,
 	MLX5_IB_STAGE_DELAY_DROP,
-	MLX5_IB_STAGE_CLASS_ATTR,
+	MLX5_IB_STAGE_RESTRACK,
 	MLX5_IB_STAGE_MAX,
 };
 
@@ -950,6 +956,15 @@
 	struct xarray event_xa;
 };
 
+struct mlx5_var_table {
+	/* serialize updating the bitmap */
+	struct mutex bitmap_lock;
+	unsigned long *bitmap;
+	u64 hw_start_addr;
+	u32 stride_size;
+	u64 num_var_hw_entries;
+};
+
 struct mlx5_ib_dev {
 	struct ib_device		ib_dev;
 	struct mlx5_core_dev		*mdev;
@@ -958,16 +973,21 @@
 	/* serialize update of capability mask
 	 */
 	struct mutex			cap_mask_mutex;
-	bool				ib_active;
+	u8				ib_active:1;
+	u8				is_rep:1;
+	u8				lag_active:1;
+	u8				wc_support:1;
+	u8				fill_delay;
 	struct umr_common		umrc;
 	/* sync used page count stats
 	 */
 	struct mlx5_ib_resources	devr;
+
+	atomic_t			mkey_var;
 	struct mlx5_mr_cache		cache;
 	struct timer_list		delay_timer;
 	/* Prevents soft lock on massive reg MRs */
 	struct mutex			slow_path_mutex;
-	int				fill_delay;
 	struct ib_odp_caps	odp_caps;
 	u64			odp_max_size;
 	struct mlx5_ib_pf_eq	odp_pf_eq;
@@ -976,7 +996,9 @@
 	 * Sleepable RCU that prevents destruction of MRs while they are still
 	 * being used by a page fault handler.
 	 */
-	struct srcu_struct      mr_srcu;
+	struct srcu_struct      odp_srcu;
+	struct xarray		odp_mkeys;
+
 	u32			null_mkey;
 	struct mlx5_ib_flow_db	*flow_db;
 	/* protect resources needed as part of reset flow */
@@ -985,11 +1007,10 @@
 	/* Array with num_ports elements */
 	struct mlx5_ib_port	*port;
 	struct mlx5_sq_bfreg	bfreg;
+	struct mlx5_sq_bfreg	wc_bfreg;
 	struct mlx5_sq_bfreg	fp_bfreg;
 	struct mlx5_ib_delay_drop	delay_drop;
 	const struct mlx5_ib_profile	*profile;
-	bool			is_rep;
-	int				lag_active;
 
 	struct mlx5_ib_lb_state		lb;
 	u8			umr_fence;
@@ -998,8 +1019,12 @@
 	struct mlx5_dm		dm;
 	u16			devx_whitelist_uid;
 	struct mlx5_srq_table   srq_table;
+	struct mlx5_qp_table    qp_table;
 	struct mlx5_async_ctx   async_ctx;
 	struct mlx5_devx_event_table devx_event_table;
+	struct mlx5_var_table var_table;
+
+	struct xarray sig_mrs;
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -1040,11 +1065,6 @@
 	return container_of(core_qp, struct mlx5_ib_rwq, core_qp);
 }
 
-static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mkey *mmkey)
-{
-	return container_of(mmkey, struct mlx5_ib_mr, mmkey);
-}
-
 static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd)
 {
 	return container_of(ibpd, struct mlx5_ib_pd, ibpd);
@@ -1096,6 +1116,13 @@
 	return container_of(ibact, struct mlx5_ib_flow_action, ib_action);
 }
 
+static inline struct mlx5_user_mmap_entry *
+to_mmmap(struct rdma_user_mmap_entry *rdma_entry)
+{
+	return container_of(rdma_entry,
+		struct mlx5_user_mmap_entry, rdma_entry);
+}
+
 int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context,
 			struct ib_udata *udata, unsigned long virt,
 			struct mlx5_db *db);
@@ -1103,16 +1130,19 @@
 void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
 void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
 void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index);
-int mlx5_ib_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+int mlx5_ib_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr,
 		      struct ib_udata *udata);
 int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
-void mlx5_ib_destroy_ah(struct ib_ah *ah, u32 flags);
+static inline int mlx5_ib_destroy_ah(struct ib_ah *ah, u32 flags)
+{
+	return 0;
+}
 int mlx5_ib_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *init_attr,
 		       struct ib_udata *udata);
 int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
 int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr);
-void mlx5_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
+int mlx5_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
 int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 			  const struct ib_recv_wr **bad_wr);
 int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp);
@@ -1127,19 +1157,15 @@
 int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata);
 void mlx5_ib_drain_sq(struct ib_qp *qp);
 void mlx5_ib_drain_rq(struct ib_qp *qp);
-int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
-		      const struct ib_send_wr **bad_wr);
-int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
-		      const struct ib_recv_wr **bad_wr);
-int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
-			     int buflen, size_t *bc);
-int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
-			     int buflen, size_t *bc);
-int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
-			      void *buffer, int buflen, size_t *bc);
+int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+			size_t buflen, size_t *bc);
+int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+			size_t buflen, size_t *bc);
+int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
+			 size_t buflen, size_t *bc);
 int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		      struct ib_udata *udata);
-void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
@@ -1154,8 +1180,7 @@
 		      struct ib_sge *sg_list,
 		      u32 num_sge,
 		      struct uverbs_attr_bundle *attrs);
-struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
-			       struct ib_udata *udata);
+int mlx5_ib_alloc_mw(struct ib_mw *mw, struct ib_udata *udata);
 int mlx5_ib_dealloc_mw(struct ib_mw *mw);
 int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
 		       int page_shift, int flags);
@@ -1163,12 +1188,13 @@
 					     struct ib_udata *udata,
 					     int access_flags);
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
+void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr);
 int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 			  u64 length, u64 virt_addr, int access_flags,
 			  struct ib_pd *pd, struct ib_udata *udata);
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			       u32 max_num_sg, struct ib_udata *udata);
+			       u32 max_num_sg);
 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
 					 u32 max_num_sg,
 					 u32 max_num_meta_sg);
@@ -1180,11 +1206,9 @@
 			 unsigned int *meta_sg_offset);
 int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 			const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-			const struct ib_mad_hdr *in, size_t in_mad_size,
-			struct ib_mad_hdr *out, size_t *out_mad_size,
-			u16 *out_mad_pkey_index);
-struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
-				   struct ib_udata *udata);
+			const struct ib_mad *in, struct ib_mad *out,
+			size_t *out_mad_size, u16 *out_mad_pkey_index);
+int mlx5_ib_alloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata);
 int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata);
 int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset);
 int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port);
@@ -1206,8 +1230,6 @@
 			    struct ib_port_attr *props);
 int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
 		       struct ib_port_attr *props);
-int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev);
-void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev);
 void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr,
 			unsigned long max_page_shift,
 			int *count, int *shift,
@@ -1222,21 +1244,23 @@
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
 
-struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry);
+struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
+				       unsigned int entry, int access_flags);
 void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
+int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr);
+
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 			    struct ib_mr_status *mr_status);
 struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
 				struct ib_wq_init_attr *init_attr,
 				struct ib_udata *udata);
-void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
+int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
 int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
 		      u32 wq_attr_mask, struct ib_udata *udata);
-struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
-						      struct ib_rwq_ind_table_init_attr *init_attr,
-						      struct ib_udata *udata);
+int mlx5_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table,
+				 struct ib_rwq_ind_table_init_attr *init_attr,
+				 struct ib_udata *udata);
 int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table);
-bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev);
 struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
 			       struct ib_ucontext *context,
 			       struct ib_dm_alloc_attr *attr,
@@ -1252,15 +1276,14 @@
 void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
-void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
-			      unsigned long end);
 void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent);
-void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
-			   size_t nentries, struct mlx5_ib_mr *mr, int flags);
+void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+			   struct mlx5_ib_mr *mr, int flags);
 
 int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
 			       enum ib_uverbs_advise_mr_advice advice,
 			       u32 flags, struct ib_sge *sg_list, u32 num_sge);
+int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable);
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
@@ -1272,9 +1295,8 @@
 static inline int mlx5_ib_odp_init(void) { return 0; }
 static inline void mlx5_ib_odp_cleanup(void)				    {}
 static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {}
-static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
-					 size_t nentries, struct mlx5_ib_mr *mr,
-					 int flags) {}
+static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+					 struct mlx5_ib_mr *mr, int flags) {}
 
 static inline int
 mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
@@ -1283,11 +1305,14 @@
 {
 	return -EOPNOTSUPP;
 }
-static inline void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp,
-					    unsigned long start,
-					    unsigned long end){};
+static inline int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable)
+{
+	return -EOPNOTSUPP;
+}
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
+extern const struct mmu_interval_notifier_ops mlx5_mn_ops;
+
 /* Needed for rep profile */
 void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
 		      const struct mlx5_ib_profile *profile,
@@ -1301,19 +1326,22 @@
 			      u8 port, int state);
 int mlx5_ib_get_vf_stats(struct ib_device *device, int vf,
 			 u8 port, struct ifla_vf_stats *stats);
+int mlx5_ib_get_vf_guid(struct ib_device *device, int vf, u8 port,
+			struct ifla_vf_guid *node_guid,
+			struct ifla_vf_guid *port_guid);
 int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port,
 			u64 guid, int type);
 
-__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev,
-			       const struct ib_gid_attr *attr);
+__be16 mlx5_get_roce_udp_sport_min(const struct mlx5_ib_dev *dev,
+				   const struct ib_gid_attr *attr);
 
 void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num);
 void mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num);
 
 /* GSI QP helper functions */
-struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd,
-				    struct ib_qp_init_attr *init_attr);
-int mlx5_ib_gsi_destroy_qp(struct ib_qp *qp);
+int mlx5_ib_create_gsi(struct ib_pd *pd, struct mlx5_ib_qp *mqp,
+		       struct ib_qp_init_attr *attr);
+int mlx5_ib_destroy_gsi(struct mlx5_ib_qp *mqp);
 int mlx5_ib_gsi_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr,
 			  int attr_mask);
 int mlx5_ib_gsi_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
@@ -1336,41 +1364,11 @@
 void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev,
 				  u8 port_num);
 
-#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
-int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user);
-void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid);
-void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev);
-void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev);
-const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void);
 extern const struct uapi_definition mlx5_ib_devx_defs[];
 extern const struct uapi_definition mlx5_ib_flow_defs[];
-struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add(
-	struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher,
-	struct mlx5_flow_context *flow_context,
-	struct mlx5_flow_act *flow_act, u32 counter_id,
-	void *cmd_in, int inlen, int dest_id, int dest_type);
-bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type);
-bool mlx5_ib_devx_is_flow_counter(void *obj, u32 *counter_id);
-int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root);
-void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction);
-#else
-static inline int
-mlx5_ib_devx_create(struct mlx5_ib_dev *dev,
-			   bool is_user) { return -EOPNOTSUPP; }
-static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) {}
-static inline void mlx5_ib_devx_init_event_table(struct mlx5_ib_dev *dev) {}
-static inline void mlx5_ib_devx_cleanup_event_table(struct mlx5_ib_dev *dev) {}
-static inline bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id,
-					     int *dest_type)
-{
-	return false;
-}
-static inline void
-mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction)
-{
-	return;
-};
-#endif
+extern const struct uapi_definition mlx5_ib_qos_defs[];
+extern const struct uapi_definition mlx5_ib_std_types_defs[];
+
 static inline void init_query_mad(struct ib_smp *mad)
 {
 	mad->base_version  = 1;
@@ -1379,18 +1377,9 @@
 	mad->method	   = IB_MGMT_METHOD_GET;
 }
 
-static inline u8 convert_access(int acc)
-{
-	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC       : 0) |
-	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX5_PERM_REMOTE_WRITE : 0) |
-	       (acc & IB_ACCESS_REMOTE_READ   ? MLX5_PERM_REMOTE_READ  : 0) |
-	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX5_PERM_LOCAL_WRITE  : 0) |
-	       MLX5_PERM_LOCAL_READ;
-}
-
 static inline int is_qp1(enum ib_qp_type qp_type)
 {
-	return qp_type == MLX5_IB_QPT_HW_GSI;
+	return qp_type == MLX5_IB_QPT_HW_GSI || qp_type == IB_QPT_GSI;
 }
 
 #define MLX5_MAX_UMR_SHIFT 16
@@ -1428,12 +1417,11 @@
 {
 	u8 cqe_version = ucontext->cqe_version;
 
-	if (field_avail(struct mlx5_ib_create_qp, uidx, inlen) &&
-	    !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
+	if ((offsetofend(typeof(*ucmd), uidx) <= inlen) && !cqe_version &&
+	    (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
 		return 0;
 
-	if (!!(field_avail(struct mlx5_ib_create_qp, uidx, inlen) !=
-	       !!cqe_version))
+	if ((offsetofend(typeof(*ucmd), uidx) <= inlen) != !!cqe_version)
 		return -EINVAL;
 
 	return verify_assign_uidx(cqe_version, ucmd->uidx, user_index);
@@ -1446,12 +1434,11 @@
 {
 	u8 cqe_version = ucontext->cqe_version;
 
-	if (field_avail(struct mlx5_ib_create_srq, uidx, inlen) &&
-	    !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
+	if ((offsetofend(typeof(*ucmd), uidx) <= inlen) && !cqe_version &&
+	    (ucmd->uidx == MLX5_IB_DEFAULT_UIDX))
 		return 0;
 
-	if (!!(field_avail(struct mlx5_ib_create_srq, uidx, inlen) !=
-	       !!cqe_version))
+	if ((offsetofend(typeof(*ucmd), uidx) <= inlen) != !!cqe_version)
 		return -EINVAL;
 
 	return verify_assign_uidx(cqe_version, ucmd->uidx, user_index);
@@ -1476,20 +1463,65 @@
 			struct mlx5_bfreg_info *bfregi, u32 bfregn,
 			bool dyn_bfreg);
 
-int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
-u16 mlx5_ib_get_counters_id(struct mlx5_ib_dev *dev, u8 port_num);
-
-static inline bool mlx5_ib_can_use_umr(struct mlx5_ib_dev *dev,
-				       bool do_modify_atomic)
+static inline bool mlx5_ib_can_load_pas_with_umr(struct mlx5_ib_dev *dev,
+						 size_t length)
 {
+	/*
+	 * umr_check_mkey_mask() rejects MLX5_MKEY_MASK_PAGE_SIZE which is
+	 * always set if MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (aka
+	 * MLX5_IB_UPD_XLT_ADDR and MLX5_IB_UPD_XLT_ENABLE) is set. Thus, a mkey
+	 * can never be enabled without this capability. Simplify handling of
+	 * this quirky hardware by just saying it can't use PAS lists with UMR
+	 * at all.
+	 */
 	if (MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
 		return false;
 
-	if (do_modify_atomic &&
+	/*
+	 * length is the size of the MR in bytes when mlx5_ib_update_xlt() is
+	 * used.
+	 */
+	if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
+	    length >= MLX5_MAX_UMR_PAGES * PAGE_SIZE)
+		return false;
+	return true;
+}
+
+/*
+ * true if an existing MR can be reconfigured to new access_flags using UMR.
+ * Older HW cannot use UMR to update certain elements of the MKC. See
+ * umr_check_mkey_mask() and get_umr_update_access_mask().
+ */
+static inline bool mlx5_ib_can_reconfig_with_umr(struct mlx5_ib_dev *dev,
+						 unsigned int current_access_flags,
+						 unsigned int target_access_flags)
+{
+	unsigned int diffs = current_access_flags ^ target_access_flags;
+
+	if ((diffs & IB_ACCESS_REMOTE_ATOMIC) &&
 	    MLX5_CAP_GEN(dev->mdev, atomic) &&
 	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
 		return false;
 
+	if ((diffs & IB_ACCESS_RELAXED_ORDERING) &&
+	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
+	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
+		return false;
+
+	if ((diffs & IB_ACCESS_RELAXED_ORDERING) &&
+	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) &&
+	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
+		return false;
+
 	return true;
 }
+
+int mlx5_ib_test_wc(struct mlx5_ib_dev *dev);
+
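+/*
+ * TX port affinity should be assigned when LAG is already active, or when the
+ * device has more than one LAG port and supports lag_tx_port_affinity.
+ */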
+static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev)
+{
+	return dev->lag_active ||
+		(MLX5_CAP_GEN(dev->mdev, num_lag_ports) > 1 &&
+		 MLX5_CAP_GEN(dev->mdev, lag_tx_port_affinity));
+}
 #endif /* MLX5_IB_H */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 99d563d..1934669 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -47,10 +47,69 @@
 
 #define MLX5_UMR_ALIGN 2048
 
+static void
+create_mkey_callback(int status, struct mlx5_async_work *context);
+
+static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
+					  struct ib_pd *pd)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+
+	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
+	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
+	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
+	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
+	MLX5_SET(mkc, mkc, lr, 1);
+
+	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
+		MLX5_SET(mkc, mkc, relaxed_ordering_write,
+			 !!(acc & IB_ACCESS_RELAXED_ORDERING));
+	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
+		MLX5_SET(mkc, mkc, relaxed_ordering_read,
+			 !!(acc & IB_ACCESS_RELAXED_ORDERING));
+
+	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
+	MLX5_SET(mkc, mkc, qpn, 0xffffff);
+	MLX5_SET64(mkc, mkc, start_addr, start_addr);
+}
+
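+/* Derive the low 8 bits of the mkey from a per-device rolling counter. */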
+static void
+assign_mkey_variant(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
+		    u32 *in)
+{
+	u8 key = atomic_inc_return(&dev->mkey_var);
+	void *mkc;
+
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	MLX5_SET(mkc, mkc, mkey_7_0, key);
+	mkey->key = key;
+}
+
+static int
+mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey,
+		    u32 *in, int inlen)
+{
+	assign_mkey_variant(dev, mkey, in);
+	return mlx5_core_create_mkey(dev->mdev, mkey, in, inlen);
+}
+
+static int
+mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
+		       struct mlx5_core_mkey *mkey,
+		       struct mlx5_async_ctx *async_ctx,
+		       u32 *in, int inlen, u32 *out, int outlen,
+		       struct mlx5_async_work *context)
+{
+	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
+	assign_mkey_variant(dev, mkey, in);
+	return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
+				create_mkey_callback, context);
+}
+
 static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
 static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
 static int mr_cache_max_order(struct mlx5_ib_dev *dev);
-static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr);
+static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);
 
 static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev)
 {
@@ -59,85 +118,79 @@
 
 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
-	int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
+	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));
 
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
-		/* Wait until all page fault handlers using the mr complete. */
-		synchronize_srcu(&dev->mr_srcu);
-
-	return err;
+	return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 }
 
-static int order2idx(struct mlx5_ib_dev *dev, int order)
-{
-	struct mlx5_mr_cache *cache = &dev->cache;
-
-	if (order < cache->ent[0].order)
-		return 0;
-	else
-		return order - cache->ent[0].order;
-}
-
-static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
+static inline bool mlx5_ib_pas_fits_in_mr(struct mlx5_ib_mr *mr, u64 start,
+					  u64 length)
 {
 	return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >=
 		length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
 }
 
-static void reg_mr_callback(int status, struct mlx5_async_work *context)
+static void create_mkey_callback(int status, struct mlx5_async_work *context)
 {
 	struct mlx5_ib_mr *mr =
 		container_of(context, struct mlx5_ib_mr, cb_work);
 	struct mlx5_ib_dev *dev = mr->dev;
-	struct mlx5_mr_cache *cache = &dev->cache;
-	int c = order2idx(dev, mr->order);
-	struct mlx5_cache_ent *ent = &cache->ent[c];
-	u8 key;
+	struct mlx5_cache_ent *ent = mr->cache_ent;
 	unsigned long flags;
-	struct xarray *mkeys = &dev->mdev->priv.mkey_table;
-	int err;
 
-	spin_lock_irqsave(&ent->lock, flags);
-	ent->pending--;
-	spin_unlock_irqrestore(&ent->lock, flags);
 	if (status) {
 		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
 		kfree(mr);
-		dev->fill_delay = 1;
+		spin_lock_irqsave(&ent->lock, flags);
+		ent->pending--;
+		WRITE_ONCE(dev->fill_delay, 1);
+		spin_unlock_irqrestore(&ent->lock, flags);
 		mod_timer(&dev->delay_timer, jiffies + HZ);
 		return;
 	}
 
 	mr->mmkey.type = MLX5_MKEY_MR;
-	spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags);
-	key = dev->mdev->priv.mkey_key++;
-	spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags);
-	mr->mmkey.key = mlx5_idx_to_mkey(MLX5_GET(create_mkey_out, mr->out, mkey_index)) | key;
+	mr->mmkey.key |= mlx5_idx_to_mkey(
+		MLX5_GET(create_mkey_out, mr->out, mkey_index));
 
-	cache->last_add = jiffies;
+	WRITE_ONCE(dev->cache.last_add, jiffies);
 
 	spin_lock_irqsave(&ent->lock, flags);
 	list_add_tail(&mr->list, &ent->head);
-	ent->cur++;
-	ent->size++;
+	ent->available_mrs++;
+	ent->total_mrs++;
+	/* If we are doing fill_to_high_water then keep going. */
+	queue_adjust_cache_locked(ent);
+	ent->pending--;
 	spin_unlock_irqrestore(&ent->lock, flags);
-
-	xa_lock_irqsave(mkeys, flags);
-	err = xa_err(__xa_store(mkeys, mlx5_base_mkey(mr->mmkey.key),
-				&mr->mmkey, GFP_ATOMIC));
-	xa_unlock_irqrestore(mkeys, flags);
-	if (err)
-		pr_err("Error inserting to mkey tree. 0x%x\n", -err);
-
-	if (!completion_done(&ent->compl))
-		complete(&ent->compl);
 }
 
-static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
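+/* Allocate an MR for the cache and pre-fill the common mkey context fields. */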
+static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
-	struct mlx5_cache_ent *ent = &cache->ent[c];
-	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	struct mlx5_ib_mr *mr;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return NULL;
+	mr->order = ent->order;
+	mr->cache_ent = ent;
+	mr->dev = ent->dev;
+
+	set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
+	MLX5_SET(mkc, mkc, free, 1);
+	MLX5_SET(mkc, mkc, umr_en, 1);
+	MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
+	MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);
+
+	MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
+	MLX5_SET(mkc, mkc, log_page_size, ent->page);
+	return mr;
+}
+
+/* Asynchronously schedule new MRs to be populated in the cache. */
+static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
+{
+	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	struct mlx5_ib_mr *mr;
 	void *mkc;
 	u32 *in;
@@ -150,42 +203,29 @@
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 	for (i = 0; i < num; i++) {
-		if (ent->pending >= MAX_PENDING_REG_MR) {
-			err = -EAGAIN;
-			break;
-		}
-
-		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+		mr = alloc_cache_mr(ent, mkc);
 		if (!mr) {
 			err = -ENOMEM;
 			break;
 		}
-		mr->order = ent->order;
-		mr->allocated_from_cache = 1;
-		mr->dev = dev;
-
-		MLX5_SET(mkc, mkc, free, 1);
-		MLX5_SET(mkc, mkc, umr_en, 1);
-		MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
-		MLX5_SET(mkc, mkc, access_mode_4_2,
-			 (ent->access_mode >> 2) & 0x7);
-
-		MLX5_SET(mkc, mkc, qpn, 0xffffff);
-		MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt);
-		MLX5_SET(mkc, mkc, log_page_size, ent->page);
-
 		spin_lock_irq(&ent->lock);
+		if (ent->pending >= MAX_PENDING_REG_MR) {
+			err = -EAGAIN;
+			spin_unlock_irq(&ent->lock);
+			kfree(mr);
+			break;
+		}
 		ent->pending++;
 		spin_unlock_irq(&ent->lock);
-		err = mlx5_core_create_mkey_cb(dev->mdev, &mr->mmkey,
-					       &dev->async_ctx, in, inlen,
-					       mr->out, sizeof(mr->out),
-					       reg_mr_callback, &mr->cb_work);
+		err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey,
+					     &ent->dev->async_ctx, in, inlen,
+					     mr->out, sizeof(mr->out),
+					     &mr->cb_work);
 		if (err) {
 			spin_lock_irq(&ent->lock);
 			ent->pending--;
 			spin_unlock_irq(&ent->lock);
-			mlx5_ib_warn(dev, "create mkey failed %d\n", err);
+			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
 			kfree(mr);
 			break;
 		}
@@ -195,35 +235,89 @@
 	return err;
 }
 
-static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
+/* Synchronously create a MR in the cache */
+static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
-	struct mlx5_cache_ent *ent = &cache->ent[c];
-	struct mlx5_ib_mr *tmp_mr;
+	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
 	struct mlx5_ib_mr *mr;
-	LIST_HEAD(del_list);
-	int i;
+	void *mkc;
+	u32 *in;
+	int err;
 
-	for (i = 0; i < num; i++) {
-		spin_lock_irq(&ent->lock);
-		if (list_empty(&ent->head)) {
-			spin_unlock_irq(&ent->lock);
-			break;
-		}
-		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
-		list_move(&mr->list, &del_list);
-		ent->cur--;
-		ent->size--;
-		spin_unlock_irq(&ent->lock);
-		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
+	in = kzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return ERR_PTR(-ENOMEM);
+	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+
+	mr = alloc_cache_mr(ent, mkc);
+	if (!mr) {
+		err = -ENOMEM;
+		goto free_in;
 	}
 
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
-		synchronize_srcu(&dev->mr_srcu);
+	err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey, in, inlen);
+	if (err)
+		goto free_mr;
 
-	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
-		list_del(&mr->list);
-		kfree(mr);
+	mr->mmkey.type = MLX5_MKEY_MR;
+	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
+	spin_lock_irq(&ent->lock);
+	ent->total_mrs++;
+	spin_unlock_irq(&ent->lock);
+	kfree(in);
+	return mr;
+free_mr:
+	kfree(mr);
+free_in:
+	kfree(in);
+	return ERR_PTR(err);
+}
+
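+/*
+ * Destroy one MR from the head of the free list. Called with ent->lock held;
+ * the lock is dropped around the destroy command and re-acquired on return.
+ */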
+static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
+{
+	struct mlx5_ib_mr *mr;
+
+	lockdep_assert_held(&ent->lock);
+	if (list_empty(&ent->head))
+		return;
+	mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
+	list_del(&mr->list);
+	ent->available_mrs--;
+	ent->total_mrs--;
+	spin_unlock_irq(&ent->lock);
+	mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey);
+	kfree(mr);
+	spin_lock_irq(&ent->lock);
+}
+
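+/*
+ * Grow or shrink the pool until available_mrs + pending reaches @target, or
+ * 2 * ent->limit when @limit_fill is set. Called with ent->lock held.
+ */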
+static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
+				bool limit_fill)
+{
+	int err;
+
+	lockdep_assert_held(&ent->lock);
+
+	while (true) {
+		if (limit_fill)
+			target = ent->limit * 2;
+		if (target == ent->available_mrs + ent->pending)
+			return 0;
+		if (target > ent->available_mrs + ent->pending) {
+			u32 todo = target - (ent->available_mrs + ent->pending);
+
+			spin_unlock_irq(&ent->lock);
+			err = add_keys(ent, todo);
+			if (err == -EAGAIN)
+				usleep_range(3000, 5000);
+			spin_lock_irq(&ent->lock);
+			if (err) {
+				if (err != -EAGAIN)
+					return err;
+			} else
+				return 0;
+		} else {
+			remove_cache_mr_locked(ent);
+		}
 	}
 }
 
@@ -231,37 +325,38 @@
 			  size_t count, loff_t *pos)
 {
 	struct mlx5_cache_ent *ent = filp->private_data;
-	struct mlx5_ib_dev *dev = ent->dev;
-	char lbuf[20] = {0};
-	u32 var;
+	u32 target;
 	int err;
-	int c;
 
-	count = min(count, sizeof(lbuf) - 1);
-	if (copy_from_user(lbuf, buf, count))
-		return -EFAULT;
+	err = kstrtou32_from_user(buf, count, 0, &target);
+	if (err)
+		return err;
 
-	c = order2idx(dev, ent->order);
-
-	if (sscanf(lbuf, "%u", &var) != 1)
-		return -EINVAL;
-
-	if (var < ent->limit)
-		return -EINVAL;
-
-	if (var > ent->size) {
-		do {
-			err = add_keys(dev, c, var - ent->size);
-			if (err && err != -EAGAIN)
-				return err;
-
-			usleep_range(3000, 5000);
-		} while (err);
-	} else if (var < ent->size) {
-		remove_keys(dev, c, ent->size - var);
+	/*
+	 * Target is the new value of total_mrs the user requests; however, we
+	 * cannot free MRs that are in use. Compute the target value for
+	 * available_mrs.
+	 */
+	spin_lock_irq(&ent->lock);
+	if (target < ent->total_mrs - ent->available_mrs) {
+		err = -EINVAL;
+		goto err_unlock;
 	}
+	target = target - (ent->total_mrs - ent->available_mrs);
+	if (target < ent->limit || target > ent->limit*2) {
+		err = -EINVAL;
+		goto err_unlock;
+	}
+	err = resize_available_mrs(ent, target, false);
+	if (err)
+		goto err_unlock;
+	spin_unlock_irq(&ent->lock);
 
 	return count;
+
+err_unlock:
+	spin_unlock_irq(&ent->lock);
+	return err;
 }
 
 static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
@@ -271,7 +366,7 @@
 	char lbuf[20];
 	int err;
 
-	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size);
+	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs);
 	if (err < 0)
 		return err;
 
@@ -289,32 +384,23 @@
 			   size_t count, loff_t *pos)
 {
 	struct mlx5_cache_ent *ent = filp->private_data;
-	struct mlx5_ib_dev *dev = ent->dev;
-	char lbuf[20] = {0};
 	u32 var;
 	int err;
-	int c;
 
-	count = min(count, sizeof(lbuf) - 1);
-	if (copy_from_user(lbuf, buf, count))
-		return -EFAULT;
+	err = kstrtou32_from_user(buf, count, 0, &var);
+	if (err)
+		return err;
 
-	c = order2idx(dev, ent->order);
-
-	if (sscanf(lbuf, "%u", &var) != 1)
-		return -EINVAL;
-
-	if (var > ent->size)
-		return -EINVAL;
-
+	/*
+	 * Upon set, we immediately fill the cache to the high water mark
+	 * implied by the limit.
+	 */
+	spin_lock_irq(&ent->lock);
 	ent->limit = var;
-
-	if (ent->cur < ent->limit) {
-		err = add_keys(dev, c, 2 * ent->limit - ent->cur);
-		if (err)
-			return err;
-	}
-
+	err = resize_available_mrs(ent, 0, true);
+	spin_unlock_irq(&ent->lock);
+	if (err)
+		return err;
 	return count;
 }
 
@@ -339,68 +425,119 @@
 	.read	= limit_read,
 };
 
-static int someone_adding(struct mlx5_mr_cache *cache)
+static bool someone_adding(struct mlx5_mr_cache *cache)
 {
-	int i;
+	unsigned int i;
 
 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
-		if (cache->ent[i].cur < cache->ent[i].limit)
-			return 1;
-	}
+		struct mlx5_cache_ent *ent = &cache->ent[i];
+		bool ret;
 
-	return 0;
+		spin_lock_irq(&ent->lock);
+		ret = ent->available_mrs < ent->limit;
+		spin_unlock_irq(&ent->lock);
+		if (ret)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Check if the bucket is outside the high/low water mark and schedule an async
+ * update. The cache refill has hysteresis: once the low water mark is hit, it
+ * is refilled up to the high mark.
+ */
+static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
+{
+	lockdep_assert_held(&ent->lock);
+
+	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
+		return;
+	if (ent->available_mrs < ent->limit) {
+		ent->fill_to_high_water = true;
+		queue_work(ent->dev->cache.wq, &ent->work);
+	} else if (ent->fill_to_high_water &&
+		   ent->available_mrs + ent->pending < 2 * ent->limit) {
+		/*
+		 * Once we start populating due to hitting a low water mark,
+		 * continue until we pass the high water mark.
+		 */
+		queue_work(ent->dev->cache.wq, &ent->work);
+	} else if (ent->available_mrs == 2 * ent->limit) {
+		ent->fill_to_high_water = false;
+	} else if (ent->available_mrs > 2 * ent->limit) {
+		/* Queue deletion of excess entries */
+		ent->fill_to_high_water = false;
+		if (ent->pending)
+			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
+					   msecs_to_jiffies(1000));
+		else
+			queue_work(ent->dev->cache.wq, &ent->work);
+	}
 }
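
The hysteresis described in the comment above reads naturally as a small state machine. Below is a standalone user-space sketch of the same decision, with invented names (adjust(), CACHE_FILL, CACHE_SHRINK) and without the disabled/fill_delay checks or the work-queue scheduling details.

#include <stdbool.h>
#include <stdio.h>

enum cache_action { CACHE_IDLE, CACHE_FILL, CACHE_SHRINK };

/* Decide what the background worker should do for one cache bucket. */
static enum cache_action adjust(unsigned int available, unsigned int pending,
				unsigned int limit, bool *fill_to_high_water)
{
	if (available < limit) {
		*fill_to_high_water = true;	/* low water mark hit */
		return CACHE_FILL;
	}
	if (*fill_to_high_water && available + pending < 2 * limit)
		return CACHE_FILL;		/* keep going to the high mark */
	if (available >= 2 * limit)
		*fill_to_high_water = false;
	if (available > 2 * limit)
		return CACHE_SHRINK;		/* queue deletion of excess MRs */
	return CACHE_IDLE;
}

int main(void)
{
	bool fill = false;

	printf("%d\n", adjust(10, 0, 16, &fill));	/* 1: CACHE_FILL    */
	printf("%d\n", adjust(20, 8, 16, &fill));	/* 1: still filling */
	printf("%d\n", adjust(40, 0, 16, &fill));	/* 2: CACHE_SHRINK  */
	return 0;
}
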
 
 static void __cache_work_func(struct mlx5_cache_ent *ent)
 {
 	struct mlx5_ib_dev *dev = ent->dev;
 	struct mlx5_mr_cache *cache = &dev->cache;
-	int i = order2idx(dev, ent->order);
 	int err;
 
-	if (cache->stopped)
-		return;
+	spin_lock_irq(&ent->lock);
+	if (ent->disabled)
+		goto out;
 
-	ent = &dev->cache.ent[i];
-	if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
-		err = add_keys(dev, i, 1);
-		if (ent->cur < 2 * ent->limit) {
-			if (err == -EAGAIN) {
-				mlx5_ib_dbg(dev, "returned eagain, order %d\n",
-					    i + 2);
-				queue_delayed_work(cache->wq, &ent->dwork,
-						   msecs_to_jiffies(3));
-			} else if (err) {
-				mlx5_ib_warn(dev, "command failed order %d, err %d\n",
-					     i + 2, err);
+	if (ent->fill_to_high_water &&
+	    ent->available_mrs + ent->pending < 2 * ent->limit &&
+	    !READ_ONCE(dev->fill_delay)) {
+		spin_unlock_irq(&ent->lock);
+		err = add_keys(ent, 1);
+		spin_lock_irq(&ent->lock);
+		if (ent->disabled)
+			goto out;
+		if (err) {
+			/*
+			 * EAGAIN only happens if pending is positive, so we
+			 * will be rescheduled from reg_mr_callback(). The only
+			 * failure path here is ENOMEM.
+			 */
+			if (err != -EAGAIN) {
+				mlx5_ib_warn(
+					dev,
+					"command failed order %d, err %d\n",
+					ent->order, err);
 				queue_delayed_work(cache->wq, &ent->dwork,
 						   msecs_to_jiffies(1000));
-			} else {
-				queue_work(cache->wq, &ent->work);
 			}
 		}
-	} else if (ent->cur > 2 * ent->limit) {
+	} else if (ent->available_mrs > 2 * ent->limit) {
+		bool need_delay;
+
 		/*
-		 * The remove_keys() logic is performed as garbage collection
-		 * task. Such task is intended to be run when no other active
-		 * processes are running.
+		 * The remove_cache_mr() logic is performed as a garbage
+		 * collection task. Such a task is intended to run when no
+		 * other active processes are running.
 		 *
 		 * The need_resched() will return TRUE if there are user tasks
 		 * to be activated in near future.
 		 *
-		 * In such case, we don't execute remove_keys() and postpone
-		 * the garbage collection work to try to run in next cycle,
-		 * in order to free CPU resources to other tasks.
+		 * In that case, we don't execute remove_cache_mr(); we postpone
+		 * the garbage collection work to the next cycle in order to
+		 * free CPU resources for other tasks.
 		 */
-		if (!need_resched() && !someone_adding(cache) &&
-		    time_after(jiffies, cache->last_add + 300 * HZ)) {
-			remove_keys(dev, i, 1);
-			if (ent->cur > ent->limit)
-				queue_work(cache->wq, &ent->work);
-		} else {
+		spin_unlock_irq(&ent->lock);
+		need_delay = need_resched() || someone_adding(cache) ||
+			     !time_after(jiffies,
+					 READ_ONCE(cache->last_add) + 300 * HZ);
+		spin_lock_irq(&ent->lock);
+		if (ent->disabled)
+			goto out;
+		if (need_delay)
 			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
-		}
+		remove_cache_mr_locked(ent);
+		queue_adjust_cache_locked(ent);
 	}
+out:
+	spin_unlock_irq(&ent->lock);
 }
 
 static void delayed_cache_work_func(struct work_struct *work)
@@ -419,117 +556,101 @@
 	__cache_work_func(ent);
 }
 
-struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry)
+/* Allocate a special entry from the cache */
+struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
+				       unsigned int entry, int access_flags)
 {
 	struct mlx5_mr_cache *cache = &dev->cache;
 	struct mlx5_cache_ent *ent;
 	struct mlx5_ib_mr *mr;
-	int err;
 
-	if (entry < 0 || entry >= MAX_MR_CACHE_ENTRIES) {
-		mlx5_ib_err(dev, "cache entry %d is out of range\n", entry);
+	if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY ||
+		    entry >= ARRAY_SIZE(cache->ent)))
 		return ERR_PTR(-EINVAL);
-	}
+
+	/* Matches access in alloc_cache_mr() */
+	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
+		return ERR_PTR(-EOPNOTSUPP);
 
 	ent = &cache->ent[entry];
-	while (1) {
-		spin_lock_irq(&ent->lock);
-		if (list_empty(&ent->head)) {
-			spin_unlock_irq(&ent->lock);
-
-			err = add_keys(dev, entry, 1);
-			if (err && err != -EAGAIN)
-				return ERR_PTR(err);
-
-			wait_for_completion(&ent->compl);
-		} else {
-			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
-					      list);
-			list_del(&mr->list);
-			ent->cur--;
-			spin_unlock_irq(&ent->lock);
-			if (ent->cur < ent->limit)
-				queue_work(cache->wq, &ent->work);
+	spin_lock_irq(&ent->lock);
+	if (list_empty(&ent->head)) {
+		spin_unlock_irq(&ent->lock);
+		mr = create_cache_mr(ent);
+		if (IS_ERR(mr))
 			return mr;
-		}
+	} else {
+		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
+		list_del(&mr->list);
+		ent->available_mrs--;
+		queue_adjust_cache_locked(ent);
+		spin_unlock_irq(&ent->lock);
 	}
+	mr->access_flags = access_flags;
+	return mr;
 }
 
-static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
+/* Return a MR already available in the cache */
+static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_ib_dev *dev = req_ent->dev;
 	struct mlx5_ib_mr *mr = NULL;
-	struct mlx5_cache_ent *ent;
-	int last_umr_cache_entry;
-	int c;
-	int i;
+	struct mlx5_cache_ent *ent = req_ent;
 
-	c = order2idx(dev, order);
-	last_umr_cache_entry = order2idx(dev, mr_cache_max_order(dev));
-	if (c < 0 || c > last_umr_cache_entry) {
-		mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
-		return NULL;
-	}
-
-	for (i = c; i <= last_umr_cache_entry; i++) {
-		ent = &cache->ent[i];
-
-		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
+	/* Try larger MR pools from the cache to satisfy the allocation */
+	for (; ent != &dev->cache.ent[MR_CACHE_LAST_STD_ENTRY + 1]; ent++) {
+		mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order,
+			    ent - dev->cache.ent);
 
 		spin_lock_irq(&ent->lock);
 		if (!list_empty(&ent->head)) {
 			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
 					      list);
 			list_del(&mr->list);
-			ent->cur--;
+			ent->available_mrs--;
+			queue_adjust_cache_locked(ent);
 			spin_unlock_irq(&ent->lock);
-			if (ent->cur < ent->limit)
-				queue_work(cache->wq, &ent->work);
 			break;
 		}
+		queue_adjust_cache_locked(ent);
 		spin_unlock_irq(&ent->lock);
-
-		queue_work(cache->wq, &ent->work);
 	}
 
 	if (!mr)
-		cache->ent[c].miss++;
+		req_ent->miss++;
 
 	return mr;
 }
 
+static void detach_mr_from_cache(struct mlx5_ib_mr *mr)
+{
+	struct mlx5_cache_ent *ent = mr->cache_ent;
+
+	mr->cache_ent = NULL;
+	spin_lock_irq(&ent->lock);
+	ent->total_mrs--;
+	spin_unlock_irq(&ent->lock);
+}
+
 void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
-	struct mlx5_mr_cache *cache = &dev->cache;
-	struct mlx5_cache_ent *ent;
-	int shrink = 0;
-	int c;
+	struct mlx5_cache_ent *ent = mr->cache_ent;
 
-	if (!mr->allocated_from_cache)
+	if (!ent)
 		return;
 
-	c = order2idx(dev, mr->order);
-	WARN_ON(c < 0 || c >= MAX_MR_CACHE_ENTRIES);
-
-	if (unreg_umr(dev, mr)) {
-		mr->allocated_from_cache = false;
+	if (mlx5_mr_cache_invalidate(mr)) {
+		detach_mr_from_cache(mr);
 		destroy_mkey(dev, mr);
-		ent = &cache->ent[c];
-		if (ent->cur < ent->limit)
-			queue_work(cache->wq, &ent->work);
+		kfree(mr);
 		return;
 	}
 
-	ent = &cache->ent[c];
 	spin_lock_irq(&ent->lock);
 	list_add_tail(&mr->list, &ent->head);
-	ent->cur++;
-	if (ent->cur > 2 * ent->limit)
-		shrink = 1;
+	ent->available_mrs++;
+	queue_adjust_cache_locked(ent);
 	spin_unlock_irq(&ent->lock);
-
-	if (shrink)
-		queue_work(cache->wq, &ent->work);
 }
 
 static void clean_keys(struct mlx5_ib_dev *dev, int c)
@@ -549,16 +670,12 @@
 		}
 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
 		list_move(&mr->list, &del_list);
-		ent->cur--;
-		ent->size--;
+		ent->available_mrs--;
+		ent->total_mrs--;
 		spin_unlock_irq(&ent->lock);
 		mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
 	}
 
-#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-	synchronize_srcu(&dev->mr_srcu);
-#endif
-
 	list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
 		list_del(&mr->list);
 		kfree(mr);
@@ -592,7 +709,7 @@
 		dir = debugfs_create_dir(ent->name, cache->root);
 		debugfs_create_file("size", 0600, dir, ent, &size_fops);
 		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
-		debugfs_create_u32("cur", 0400, dir, &ent->cur);
+		debugfs_create_u32("cur", 0400, dir, &ent->available_mrs);
 		debugfs_create_u32("miss", 0600, dir, &ent->miss);
 	}
 }
@@ -601,7 +718,7 @@
 {
 	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);
 
-	dev->fill_delay = 0;
+	WRITE_ONCE(dev->fill_delay, 0);
 }
 
 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
@@ -627,7 +744,6 @@
 		ent->dev = dev;
 		ent->limit = 0;
 
-		init_completion(&ent->compl);
 		INIT_WORK(&ent->work, cache_work_func);
 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
 
@@ -644,12 +760,14 @@
 			   MLX5_IB_UMR_OCTOWORD;
 		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
 		if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
-		    !dev->is_rep &&
-		    mlx5_core_is_pf(dev->mdev))
+		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
+		    mlx5_ib_can_load_pas_with_umr(dev, 0))
 			ent->limit = dev->mdev->profile->mr_cache[i].limit;
 		else
 			ent->limit = 0;
-		queue_work(cache->wq, &ent->work);
+		spin_lock_irq(&ent->lock);
+		queue_adjust_cache_locked(ent);
+		spin_unlock_irq(&ent->lock);
 	}
 
 	mlx5_mr_cache_debugfs_init(dev);
@@ -659,13 +777,20 @@
 
 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
 {
-	int i;
+	unsigned int i;
 
 	if (!dev->cache.wq)
 		return 0;
 
-	dev->cache.stopped = 1;
-	flush_workqueue(dev->cache.wq);
+	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+		struct mlx5_cache_ent *ent = &dev->cache.ent[i];
+
+		spin_lock_irq(&ent->lock);
+		ent->disabled = true;
+		spin_unlock_irq(&ent->lock);
+		cancel_work_sync(&ent->work);
+		cancel_delayed_work_sync(&ent->dwork);
+	}
 
 	mlx5_mr_cache_debugfs_cleanup(dev);
 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
@@ -683,7 +808,6 @@
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
-	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_ib_mr *mr;
 	void *mkc;
 	u32 *in;
@@ -702,18 +826,10 @@
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 
 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
-	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
-	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
-	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
-	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
-	MLX5_SET(mkc, mkc, lr, 1);
-
 	MLX5_SET(mkc, mkc, length64, 1);
-	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
-	MLX5_SET(mkc, mkc, qpn, 0xffffff);
-	MLX5_SET64(mkc, mkc, start_addr, 0);
+	set_mkc_access_pd_addr_fields(mkc, acc, 0, pd);
 
-	err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
+	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
 	if (err)
 		goto err_in;
 
@@ -752,10 +868,9 @@
 	return MLX5_MAX_UMR_SHIFT;
 }
 
-static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
-		       u64 start, u64 length, int access_flags,
-		       struct ib_umem **umem, int *npages, int *page_shift,
-		       int *ncont, int *order)
+static int mr_umem_get(struct mlx5_ib_dev *dev, u64 start, u64 length,
+		       int access_flags, struct ib_umem **umem, int *npages,
+		       int *page_shift, int *ncont, int *order)
 {
 	struct ib_umem *u;
 
@@ -764,7 +879,8 @@
 	if (access_flags & IB_ACCESS_ON_DEMAND) {
 		struct ib_umem_odp *odp;
 
-		odp = ib_umem_odp_get(udata, start, length, access_flags);
+		odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
+				      &mlx5_mn_ops);
 		if (IS_ERR(odp)) {
 			mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
 				    PTR_ERR(odp));
@@ -779,7 +895,7 @@
 		if (order)
 			*order = ilog2(roundup_pow_of_two(*ncont));
 	} else {
-		u = ib_umem_get(udata, start, length, access_flags, 0);
+		u = ib_umem_get(&dev->ib_dev, start, length, access_flags);
 		if (IS_ERR(u)) {
 			mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
 			return PTR_ERR(u);
@@ -846,31 +962,42 @@
 	return err;
 }
 
-static struct mlx5_ib_mr *alloc_mr_from_cache(
-				  struct ib_pd *pd, struct ib_umem *umem,
-				  u64 virt_addr, u64 len, int npages,
-				  int page_shift, int order, int access_flags)
+static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
+						      unsigned int order)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+
+	if (order < cache->ent[0].order)
+		return &cache->ent[0];
+	order = order - cache->ent[0].order;
+	if (order > MR_CACHE_LAST_STD_ENTRY)
+		return NULL;
+	return &cache->ent[order];
+}
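
mr_cache_ent_from_order() above is a plain linear mapping from allocation order to cache bucket. The sketch below reproduces it in user space; the first cached order and the number of standard entries are assumed values chosen for the example, not the driver's actual constants.

#include <stdio.h>

#define SKETCH_FIRST_ORDER	2	/* assumed order of cache entry 0 */
#define SKETCH_LAST_STD_ENTRY	15	/* assumed index of the last standard entry */

/* Map an allocation order to a cache entry index; -1 means "no cache entry". */
static int cache_index_from_order(unsigned int order)
{
	if (order < SKETCH_FIRST_ORDER)
		return 0;
	order -= SKETCH_FIRST_ORDER;
	if (order > SKETCH_LAST_STD_ENTRY)
		return -1;	/* too big, caller falls back to reg_create() */
	return (int)order;
}

int main(void)
{
	printf("%d %d %d\n", cache_index_from_order(1),
	       cache_index_from_order(9), cache_index_from_order(30));
	/* prints "0 7 -1" */
	return 0;
}
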
+
+static struct mlx5_ib_mr *
+alloc_mr_from_cache(struct ib_pd *pd, struct ib_umem *umem, u64 virt_addr,
+		    u64 len, int npages, int page_shift, unsigned int order,
+		    int access_flags)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_cache_ent *ent = mr_cache_ent_from_order(dev, order);
 	struct mlx5_ib_mr *mr;
-	int err = 0;
-	int i;
 
-	for (i = 0; i < 1; i++) {
-		mr = alloc_cached_mr(dev, order);
-		if (mr)
-			break;
+	if (!ent)
+		return ERR_PTR(-E2BIG);
 
-		err = add_keys(dev, order2idx(dev, order), 1);
-		if (err && err != -EAGAIN) {
-			mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
-			break;
-		}
+	/* Matches access in alloc_cache_mr() */
+	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, access_flags))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	mr = get_cache_mr(ent);
+	if (!mr) {
+		mr = create_cache_mr(ent);
+		if (IS_ERR(mr))
+			return mr;
 	}
 
-	if (!mr)
-		return ERR_PTR(-EAGAIN);
-
 	mr->ibmr.pd = pd;
 	mr->umem = umem;
 	mr->access_flags = access_flags;
@@ -882,36 +1009,6 @@
 	return mr;
 }
 
-static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages,
-			       void *xlt, int page_shift, size_t size,
-			       int flags)
-{
-	struct mlx5_ib_dev *dev = mr->dev;
-	struct ib_umem *umem = mr->umem;
-
-	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
-		if (!umr_can_use_indirect_mkey(dev))
-			return -EPERM;
-		mlx5_odp_populate_klm(xlt, idx, npages, mr, flags);
-		return npages;
-	}
-
-	npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx);
-
-	if (!(flags & MLX5_IB_UPD_XLT_ZAP)) {
-		__mlx5_ib_populate_pas(dev, umem, page_shift,
-				       idx, npages, xlt,
-				       MLX5_IB_MTT_PRESENT);
-		/* Clear padding after the pages
-		 * brought from the umem.
-		 */
-		memset(xlt + (npages * sizeof(struct mlx5_mtt)), 0,
-		       size - npages * sizeof(struct mlx5_mtt));
-	}
-
-	return npages;
-}
-
 #define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \
 			    MLX5_UMR_MTT_ALIGNMENT)
 #define MLX5_SPARE_UMR_CHUNK 0x10000
@@ -935,6 +1032,7 @@
 	size_t pages_mapped = 0;
 	size_t pages_to_map = 0;
 	size_t pages_iter = 0;
+	size_t size_to_map = 0;
 	gfp_t gfp;
 	bool use_emergency_page = false;
 
@@ -981,6 +1079,15 @@
 		goto free_xlt;
 	}
 
+	if (mr->umem->is_odp) {
+		if (!(flags & MLX5_IB_UPD_XLT_INDIRECT)) {
+			struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+			size_t max_pages = ib_umem_odp_num_pages(odp) - idx;
+
+			pages_to_map = min_t(size_t, pages_to_map, max_pages);
+		}
+	}
+
 	sg.addr = dma;
 	sg.lkey = dev->umrc.pd->local_dma_lkey;
 
@@ -1003,14 +1110,22 @@
 	     pages_mapped < pages_to_map && !err;
 	     pages_mapped += pages_iter, idx += pages_iter) {
 		npages = min_t(int, pages_iter, pages_to_map - pages_mapped);
+		size_to_map = npages * desc_size;
 		dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
-		npages = populate_xlt(mr, idx, npages, xlt,
-				      page_shift, size, flags);
-
+		if (mr->umem->is_odp) {
+			mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
+		} else {
+			__mlx5_ib_populate_pas(dev, mr->umem, page_shift, idx,
+					       npages, xlt,
+					       MLX5_IB_MTT_PRESENT);
+			/* Clear padding after the pages
+			 * brought from the umem.
+			 */
+			memset(xlt + size_to_map, 0, size - size_to_map);
+		}
 		dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
 
-		sg.length = ALIGN(npages * desc_size,
-				  MLX5_UMR_MTT_ALIGNMENT);
+		sg.length = ALIGN(size_to_map, MLX5_UMR_MTT_ALIGNMENT);
 
 		if (pages_mapped + pages_iter >= pages_to_map) {
 			if (flags & MLX5_IB_UPD_XLT_ENABLE)
@@ -1078,38 +1193,37 @@
 		goto err_1;
 	}
 	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
-	if (populate && !(access_flags & IB_ACCESS_ON_DEMAND))
+	if (populate) {
+		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
+			err = -EINVAL;
+			goto err_2;
+		}
 		mlx5_ib_populate_pas(dev, umem, page_shift, pas,
 				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
+	}
 
 	/* The pg_access bit allows setting the access flags
 	 * in the page list submitted with the command. */
 	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+	set_mkc_access_pd_addr_fields(mkc, access_flags, virt_addr,
+				      populate ? pd : dev->umrc.pd);
 	MLX5_SET(mkc, mkc, free, !populate);
 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
-	MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
-	MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
-	MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
-	MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
-	MLX5_SET(mkc, mkc, lr, 1);
 	MLX5_SET(mkc, mkc, umr_en, 1);
 
-	MLX5_SET64(mkc, mkc, start_addr, virt_addr);
 	MLX5_SET64(mkc, mkc, len, length);
-	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
 	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
 	MLX5_SET(mkc, mkc, translations_octword_size,
 		 get_octo_len(virt_addr, length, page_shift));
 	MLX5_SET(mkc, mkc, log_page_size, page_shift);
-	MLX5_SET(mkc, mkc, qpn, 0xffffff);
 	if (populate) {
 		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
 			 get_octo_len(virt_addr, length, page_shift));
 	}
 
-	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
+	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
 	if (err) {
 		mlx5_ib_warn(dev, "create mkey failed\n");
 		goto err_2;
@@ -1134,10 +1248,8 @@
 }
 
 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
-			  int npages, u64 length, int access_flags)
+			  u64 length, int access_flags)
 {
-	mr->npages = npages;
-	atomic_add(npages, &dev->mdev->priv.reg_pages);
 	mr->ibmr.lkey = mr->mmkey.key;
 	mr->ibmr.rkey = mr->mmkey.key;
 	mr->ibmr.length = length;
@@ -1149,7 +1261,6 @@
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
-	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_ib_mr *mr;
 	void *mkc;
 	u32 *in;
@@ -1169,25 +1280,16 @@
 
 	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
 	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
-	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
-	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
-	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
-	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
-	MLX5_SET(mkc, mkc, lr, 1);
-
 	MLX5_SET64(mkc, mkc, len, length);
-	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
-	MLX5_SET(mkc, mkc, qpn, 0xffffff);
-	MLX5_SET64(mkc, mkc, start_addr, start_addr);
+	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);
 
-	err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
+	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
 	if (err)
 		goto err_in;
 
 	kfree(in);
 
-	mr->umem = NULL;
-	set_mr_fields(dev, mr, 0, length, acc);
+	set_mr_fields(dev, mr, length, acc);
 
 	return &mr->ibmr;
 
@@ -1208,7 +1310,8 @@
 		      struct uverbs_attr_bundle *attrs)
 {
 	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
-	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE)
+	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
+	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
 		return -EOPNOTSUPP;
 
 	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
@@ -1253,7 +1356,7 @@
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
 	struct mlx5_ib_mr *mr = NULL;
-	bool use_umr;
+	bool xlt_with_umr;
 	struct ib_umem *umem;
 	int page_shift;
 	int npages;
@@ -1267,8 +1370,15 @@
 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
 		    start, virt_addr, length, access_flags);
 
+	xlt_with_umr = mlx5_ib_can_load_pas_with_umr(dev, length);
+	/* ODP requires xlt update via umr to work. */
+	if (!xlt_with_umr && (access_flags & IB_ACCESS_ON_DEMAND))
+		return ERR_PTR(-EINVAL);
+
 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start &&
 	    length == U64_MAX) {
+		if (virt_addr != start)
+			return ERR_PTR(-EINVAL);
 		if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
 		    !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
 			return ERR_PTR(-EINVAL);
@@ -1279,34 +1389,23 @@
 		return &mr->ibmr;
 	}
 
-	err = mr_umem_get(dev, udata, start, length, access_flags, &umem,
+	err = mr_umem_get(dev, start, length, access_flags, &umem,
 			  &npages, &page_shift, &ncont, &order);
 
 	if (err < 0)
 		return ERR_PTR(err);
 
-	use_umr = mlx5_ib_can_use_umr(dev, true);
-
-	if (order <= mr_cache_max_order(dev) && use_umr) {
+	if (xlt_with_umr) {
 		mr = alloc_mr_from_cache(pd, umem, virt_addr, length, ncont,
 					 page_shift, order, access_flags);
-		if (PTR_ERR(mr) == -EAGAIN) {
-			mlx5_ib_dbg(dev, "cache empty for order %d\n", order);
+		if (IS_ERR(mr))
 			mr = NULL;
-		}
-	} else if (!MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) {
-		if (access_flags & IB_ACCESS_ON_DEMAND) {
-			err = -EINVAL;
-			pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB\n");
-			goto error;
-		}
-		use_umr = false;
 	}
 
 	if (!mr) {
 		mutex_lock(&dev->slow_path_mutex);
 		mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
-				page_shift, access_flags, !use_umr);
+				page_shift, access_flags, !xlt_with_umr);
 		mutex_unlock(&dev->slow_path_mutex);
 	}
 
@@ -1318,17 +1417,20 @@
 	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
 
 	mr->umem = umem;
-	set_mr_fields(dev, mr, npages, length, access_flags);
+	mr->npages = npages;
+	atomic_add(mr->npages, &dev->mdev->priv.reg_pages);
+	set_mr_fields(dev, mr, length, access_flags);
 
-	if (use_umr) {
+	if (xlt_with_umr && !(access_flags & IB_ACCESS_ON_DEMAND)) {
+		/*
+		 * If the MR was created with reg_create then it will be
+		 * configured properly but left disabled. It is safe to go ahead
+		 * and configure it again via UMR while enabling it.
+		 */
 		int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
 
-		if (access_flags & IB_ACCESS_ON_DEMAND)
-			update_xlt_flags |= MLX5_IB_UPD_XLT_ZAP;
-
 		err = mlx5_ib_update_xlt(mr, 0, ncont, page_shift,
 					 update_xlt_flags);
-
 		if (err) {
 			dereg_mr(dev, mr);
 			return ERR_PTR(err);
@@ -1337,10 +1439,22 @@
 
 	if (is_odp_mr(mr)) {
 		to_ib_umem_odp(mr->umem)->private = mr;
-		atomic_set(&mr->num_pending_prefetch, 0);
+		init_waitqueue_head(&mr->q_deferred_work);
+		atomic_set(&mr->num_deferred_work, 0);
+		err = xa_err(xa_store(&dev->odp_mkeys,
+				      mlx5_base_mkey(mr->mmkey.key), &mr->mmkey,
+				      GFP_KERNEL));
+		if (err) {
+			dereg_mr(dev, mr);
+			return ERR_PTR(err);
+		}
+
+		err = mlx5_ib_init_odp_mr(mr, xlt_with_umr);
+		if (err) {
+			dereg_mr(dev, mr);
+			return ERR_PTR(err);
+		}
 	}
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
-		smp_store_release(&mr->live, 1);
 
 	return &mr->ibmr;
 error:
@@ -1348,22 +1462,29 @@
 	return ERR_PTR(err);
 }
 
-static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+/**
+ * mlx5_mr_cache_invalidate - Fence all DMA on the MR
+ * @mr: The MR to fence
+ *
+ * Upon return, the NIC will not be doing any DMA to the pages under the MR,
+ * and any DMA in progress will be completed. Failure of this function
+ * indicates the HW has failed catastrophically.
+ */
+int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr)
 {
-	struct mlx5_core_dev *mdev = dev->mdev;
 	struct mlx5_umr_wr umrwr = {};
 
-	if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+	if (mr->dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
 		return 0;
 
 	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR |
 			      MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS;
 	umrwr.wr.opcode = MLX5_IB_WR_UMR;
-	umrwr.pd = dev->umrc.pd;
+	umrwr.pd = mr->dev->umrc.pd;
 	umrwr.mkey = mr->mmkey.key;
 	umrwr.ignore_free_state = 1;
 
-	return mlx5_ib_post_send_wait(dev, &umrwr);
+	return mlx5_ib_post_send_wait(mr->dev, &umrwr);
 }
 
 static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr,
@@ -1410,8 +1531,6 @@
 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
 		    start, virt_addr, length, access_flags);
 
-	atomic_sub(mr->npages, &dev->mdev->priv.reg_pages);
-
 	if (!mr->umem)
 		return -EINVAL;
 
@@ -1432,24 +1551,30 @@
 		 * used.
 		 */
 		flags |= IB_MR_REREG_TRANS;
+		atomic_sub(mr->npages, &dev->mdev->priv.reg_pages);
+		mr->npages = 0;
 		ib_umem_release(mr->umem);
 		mr->umem = NULL;
-		err = mr_umem_get(dev, udata, addr, len, access_flags,
-				  &mr->umem, &npages, &page_shift, &ncont,
-				  &order);
+
+		err = mr_umem_get(dev, addr, len, access_flags, &mr->umem,
+				  &npages, &page_shift, &ncont, &order);
 		if (err)
 			goto err;
+		mr->npages = ncont;
+		atomic_add(mr->npages, &dev->mdev->priv.reg_pages);
 	}
 
-	if (!mlx5_ib_can_use_umr(dev, true) ||
-	    (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len))) {
+	if (!mlx5_ib_can_reconfig_with_umr(dev, mr->access_flags,
+					   access_flags) ||
+	    !mlx5_ib_can_load_pas_with_umr(dev, len) ||
+	    (flags & IB_MR_REREG_TRANS &&
+	     !mlx5_ib_pas_fits_in_mr(mr, addr, len))) {
 		/*
 		 * UMR can't be used - MKey needs to be replaced.
 		 */
-		if (mr->allocated_from_cache)
-			err = unreg_umr(dev, mr);
-		else
-			err = destroy_mkey(dev, mr);
+		if (mr->cache_ent)
+			detach_mr_from_cache(mr);
+		err = destroy_mkey(dev, mr);
 		if (err)
 			goto err;
 
@@ -1461,8 +1586,6 @@
 			mr = to_mmr(ib_mr);
 			goto err;
 		}
-
-		mr->allocated_from_cache = 0;
 	} else {
 		/*
 		 * Send a UMR WQE
@@ -1489,7 +1612,7 @@
 			goto err;
 	}
 
-	set_mr_fields(dev, mr, npages, len, access_flags);
+	set_mr_fields(dev, mr, len, access_flags);
 
 	return 0;
 
@@ -1549,8 +1672,6 @@
 
 static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 {
-	int allocated_from_cache = mr->allocated_from_cache;
-
 	if (mr->sig) {
 		if (mlx5_core_destroy_psv(dev->mdev,
 					  mr->sig->psv_memory.psv_idx))
@@ -1560,11 +1681,12 @@
 					  mr->sig->psv_wire.psv_idx))
 			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
 				     mr->sig->psv_wire.psv_idx);
+		xa_erase(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key));
 		kfree(mr->sig);
 		mr->sig = NULL;
 	}
 
-	if (!allocated_from_cache) {
+	if (!mr->cache_ent) {
 		destroy_mkey(dev, mr);
 		mlx5_free_priv_descs(mr);
 	}
@@ -1575,54 +1697,20 @@
 	int npages = mr->npages;
 	struct ib_umem *umem = mr->umem;
 
-	if (is_odp_mr(mr)) {
-		struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem);
+	/* Stop all DMA */
+	if (is_odp_mr(mr))
+		mlx5_ib_fence_odp_mr(mr);
+	else
+		clean_mr(dev, mr);
 
-		/* Prevent new page faults and
-		 * prefetch requests from succeeding
-		 */
-		WRITE_ONCE(mr->live, 0);
-
-		/* Wait for all running page-fault handlers to finish. */
-		synchronize_srcu(&dev->mr_srcu);
-
-		/* dequeue pending prefetch requests for the mr */
-		if (atomic_read(&mr->num_pending_prefetch))
-			flush_workqueue(system_unbound_wq);
-		WARN_ON(atomic_read(&mr->num_pending_prefetch));
-
-		/* Destroy all page mappings */
-		if (!umem_odp->is_implicit_odp)
-			mlx5_ib_invalidate_range(umem_odp,
-						 ib_umem_start(umem_odp),
-						 ib_umem_end(umem_odp));
-		else
-			mlx5_ib_free_implicit_mr(mr);
-		/*
-		 * We kill the umem before the MR for ODP,
-		 * so that there will not be any invalidations in
-		 * flight, looking at the *mr struct.
-		 */
-		ib_umem_odp_release(umem_odp);
-		atomic_sub(npages, &dev->mdev->priv.reg_pages);
-
-		/* Avoid double-freeing the umem. */
-		umem = NULL;
-	}
-
-	clean_mr(dev, mr);
-
-	/*
-	 * We should unregister the DMA address from the HCA before
-	 * remove the DMA mapping.
-	 */
-	mlx5_mr_cache_free(dev, mr);
-	ib_umem_release(umem);
-	if (umem)
-		atomic_sub(npages, &dev->mdev->priv.reg_pages);
-
-	if (!mr->allocated_from_cache)
+	if (mr->cache_ent)
+		mlx5_mr_cache_free(dev, mr);
+	else
 		kfree(mr);
+
+	ib_umem_release(umem);
+	atomic_sub(npages, &dev->mdev->priv.reg_pages);
+
 }
 
 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
@@ -1634,6 +1722,11 @@
 		dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr);
 	}
 
+	if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) {
+		mlx5_ib_free_implicit_mr(mmr);
+		return 0;
+	}
+
 	dereg_mr(to_mdev(ibmr->device), mmr);
 
 	return 0;
@@ -1646,9 +1739,9 @@
 
 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 
+	/* This is only used from the kernel, so setting the PD is OK. */
+	set_mkc_access_pd_addr_fields(mkc, 0, 0, pd);
 	MLX5_SET(mkc, mkc, free, 1);
-	MLX5_SET(mkc, mkc, qpn, 0xffffff);
-	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
 	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
 	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
 	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
@@ -1673,7 +1766,7 @@
 
 	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);
 
-	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
+	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
 	if (err)
 		goto err_free_descs;
 
@@ -1797,8 +1890,15 @@
 	if (err)
 		goto err_free_mtt_mr;
 
+	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
+			      mr->sig, GFP_KERNEL));
+	if (err)
+		goto err_free_descs;
 	return 0;
 
+err_free_descs:
+	destroy_mkey(dev, mr);
+	mlx5_free_priv_descs(mr);
 err_free_mtt_mr:
 	dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr);
 	mr->mtt_mr = NULL;
@@ -1873,7 +1973,7 @@
 }
 
 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			       u32 max_num_sg, struct ib_udata *udata)
+			       u32 max_num_sg)
 {
 	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
 }
@@ -1885,12 +1985,11 @@
 				  max_num_meta_sg);
 }
 
-struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
-			       struct ib_udata *udata)
+int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
 {
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
-	struct mlx5_ib_mw *mw = NULL;
+	struct mlx5_ib_mw *mw = to_mmw(ibmw);
 	u32 *in = NULL;
 	void *mkc;
 	int ndescs;
@@ -1903,21 +2002,20 @@
 
 	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
 	if (err)
-		return ERR_PTR(err);
+		return err;
 
 	if (req.comp_mask || req.reserved1 || req.reserved2)
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
 	if (udata->inlen > sizeof(req) &&
 	    !ib_is_udata_cleared(udata, sizeof(req),
 				 udata->inlen - sizeof(req)))
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
 	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
 
-	mw = kzalloc(sizeof(*mw), GFP_KERNEL);
 	in = kzalloc(inlen, GFP_KERNEL);
-	if (!mw || !in) {
+	if (!in) {
 		err = -ENOMEM;
 		goto free;
 	}
@@ -1926,61 +2024,62 @@
 
 	MLX5_SET(mkc, mkc, free, 1);
 	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
-	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
+	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
 	MLX5_SET(mkc, mkc, umr_en, 1);
 	MLX5_SET(mkc, mkc, lr, 1);
 	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
-	MLX5_SET(mkc, mkc, en_rinval, !!((type == IB_MW_TYPE_2)));
+	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
 
-	err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, inlen);
+	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
 	if (err)
 		goto free;
 
 	mw->mmkey.type = MLX5_MKEY_MW;
-	mw->ibmw.rkey = mw->mmkey.key;
+	ibmw->rkey = mw->mmkey.key;
 	mw->ndescs = ndescs;
 
-	resp.response_length = min(offsetof(typeof(resp), response_length) +
-				   sizeof(resp.response_length), udata->outlen);
+	resp.response_length =
+		min(offsetofend(typeof(resp), response_length), udata->outlen);
 	if (resp.response_length) {
 		err = ib_copy_to_udata(udata, &resp, resp.response_length);
-		if (err) {
-			mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
-			goto free;
-		}
+		if (err)
+			goto free_mkey;
+	}
+
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+		err = xa_err(xa_store(&dev->odp_mkeys,
+				      mlx5_base_mkey(mw->mmkey.key), &mw->mmkey,
+				      GFP_KERNEL));
+		if (err)
+			goto free_mkey;
 	}
 
 	kfree(in);
-	return &mw->ibmw;
+	return 0;
 
+free_mkey:
+	mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
 free:
-	kfree(mw);
 	kfree(in);
-	return ERR_PTR(err);
+	return err;
 }
 
 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
 {
 	struct mlx5_ib_dev *dev = to_mdev(mw->device);
 	struct mlx5_ib_mw *mmw = to_mmw(mw);
-	int err;
 
 	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
-		xa_erase_irq(&dev->mdev->priv.mkey_table,
-			     mlx5_base_mkey(mmw->mmkey.key));
+		xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key));
 		/*
 		 * pagefault_single_data_segment() may be accessing mmw under
 		 * SRCU if the user bound an ODP MR to this MW.
 		 */
-		synchronize_srcu(&dev->mr_srcu);
+		synchronize_srcu(&dev->odp_srcu);
 	}
 
-	err = mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
-	if (err)
-		return err;
-	kfree(mmw);
-	return 0;
+	return mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
 }
 
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 3f9478d..5c853ec 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -36,6 +36,7 @@
 
 #include "mlx5_ib.h"
 #include "cmd.h"
+#include "qp.h"
 
 #include <linux/mlx5/eq.h>
 
@@ -93,182 +94,224 @@
 
 static u64 mlx5_imr_ksm_entries;
 
-static int check_parent(struct ib_umem_odp *odp,
-			       struct mlx5_ib_mr *parent)
+static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
+			struct mlx5_ib_mr *imr, int flags)
 {
-	struct mlx5_ib_mr *mr = odp->private;
-
-	return mr && mr->parent == parent && !odp->dying;
-}
-
-static struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr)
-{
-	if (WARN_ON(!mr || !is_odp_mr(mr)))
-		return NULL;
-
-	return to_ib_umem_odp(mr->umem)->per_mm;
-}
-
-static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp)
-{
-	struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent;
-	struct ib_ucontext_per_mm *per_mm = odp->per_mm;
-	struct rb_node *rb;
-
-	down_read(&per_mm->umem_rwsem);
-	while (1) {
-		rb = rb_next(&odp->interval_tree.rb);
-		if (!rb)
-			goto not_found;
-		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
-		if (check_parent(odp, parent))
-			goto end;
-	}
-not_found:
-	odp = NULL;
-end:
-	up_read(&per_mm->umem_rwsem);
-	return odp;
-}
-
-static struct ib_umem_odp *odp_lookup(u64 start, u64 length,
-				      struct mlx5_ib_mr *parent)
-{
-	struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(parent);
-	struct ib_umem_odp *odp;
-	struct rb_node *rb;
-
-	down_read(&per_mm->umem_rwsem);
-	odp = rbt_ib_umem_lookup(&per_mm->umem_tree, start, length);
-	if (!odp)
-		goto end;
-
-	while (1) {
-		if (check_parent(odp, parent))
-			goto end;
-		rb = rb_next(&odp->interval_tree.rb);
-		if (!rb)
-			goto not_found;
-		odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb);
-		if (ib_umem_start(odp) > start + length)
-			goto not_found;
-	}
-not_found:
-	odp = NULL;
-end:
-	up_read(&per_mm->umem_rwsem);
-	return odp;
-}
-
-void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
-			   size_t nentries, struct mlx5_ib_mr *mr, int flags)
-{
-	struct ib_pd *pd = mr->ibmr.pd;
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	struct ib_umem_odp *odp;
-	unsigned long va;
-	int i;
+	struct mlx5_klm *end = pklm + nentries;
 
 	if (flags & MLX5_IB_UPD_XLT_ZAP) {
-		for (i = 0; i < nentries; i++, pklm++) {
+		for (; pklm != end; pklm++, idx++) {
 			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
-			pklm->key = cpu_to_be32(dev->null_mkey);
+			pklm->key = cpu_to_be32(imr->dev->null_mkey);
 			pklm->va = 0;
 		}
 		return;
 	}
 
 	/*
-	 * The locking here is pretty subtle. Ideally the implicit children
-	 * list would be protected by the umem_mutex, however that is not
+	 * The locking here is pretty subtle. Ideally the implicit_children
+	 * xarray would be protected by the umem_mutex, however that is not
 	 * possible. Instead this uses a weaker update-then-lock pattern:
 	 *
 	 *  srcu_read_lock()
-	 *    <change children list>
+	 *    xa_store()
 	 *    mutex_lock(umem_mutex)
 	 *     mlx5_ib_update_xlt()
 	 *    mutex_unlock(umem_mutex)
 	 *    destroy lkey
 	 *
-	 * ie any change the children list must be followed by the locked
-	 * update_xlt before destroying.
+	 * i.e. any change to the xarray must be followed by the locked
+	 * update_xlt before destroying.
 	 *
 	 * The umem_mutex provides the acquire/release semantic needed to make
-	 * the children list visible to a racing thread. While SRCU is not
+	 * the xa_store() visible to a racing thread. While SRCU is not
 	 * technically required, using it gives consistent use of the SRCU
-	 * locking around the children list.
+	 * locking around the xarray.
 	 */
-	lockdep_assert_held(&to_ib_umem_odp(mr->umem)->umem_mutex);
-	lockdep_assert_held(&mr->dev->mr_srcu);
+	lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
+	lockdep_assert_held(&imr->dev->odp_srcu);
 
-	odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE,
-			 nentries * MLX5_IMR_MTT_SIZE, mr);
+	for (; pklm != end; pklm++, idx++) {
+		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);
 
-	for (i = 0; i < nentries; i++, pklm++) {
 		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
-		va = (offset + i) * MLX5_IMR_MTT_SIZE;
-		if (odp && ib_umem_start(odp) == va) {
-			struct mlx5_ib_mr *mtt = odp->private;
-
+		if (mtt) {
 			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
-			odp = odp_next(odp);
+			pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
 		} else {
-			pklm->key = cpu_to_be32(dev->null_mkey);
+			pklm->key = cpu_to_be32(imr->dev->null_mkey);
+			pklm->va = 0;
 		}
-		mlx5_ib_dbg(dev, "[%d] va %lx key %x\n",
-			    i, va, be32_to_cpu(pklm->key));
 	}
 }
 
-static void mr_leaf_free_action(struct work_struct *work)
+static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
 {
-	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
-	int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
-	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
+
+	if (umem_dma & ODP_READ_ALLOWED_BIT)
+		mtt_entry |= MLX5_IB_MTT_READ;
+	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
+		mtt_entry |= MLX5_IB_MTT_WRITE;
+
+	return mtt_entry;
+}
+
+static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
+			 struct mlx5_ib_mr *mr, int flags)
+{
+	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+	dma_addr_t pa;
+	size_t i;
+
+	if (flags & MLX5_IB_UPD_XLT_ZAP)
+		return;
+
+	for (i = 0; i < nentries; i++) {
+		pa = odp->dma_list[idx + i];
+		pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
+	}
+}
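
umem_dma_to_mtt() and populate_mtt() above translate ODP DMA addresses, which carry access permissions in low-order flag bits, into device MTT entries. The following sketch shows only that bit translation; the SKETCH_* bit positions are placeholders and do not match the driver's ODP_* or MLX5_IB_MTT_* definitions.

#include <stdint.h>
#include <stdio.h>

/* Placeholder bit layout, chosen only for this sketch. */
#define SKETCH_READ_ALLOWED	(1ULL << 0)
#define SKETCH_WRITE_ALLOWED	(1ULL << 1)
#define SKETCH_ADDR_MASK	(~0xfULL)
#define SKETCH_MTT_READ		(1ULL << 2)
#define SKETCH_MTT_WRITE	(1ULL << 3)

static uint64_t dma_to_mtt(uint64_t dma)
{
	uint64_t mtt = dma & SKETCH_ADDR_MASK;	/* strip the host-side flags */

	if (dma & SKETCH_READ_ALLOWED)
		mtt |= SKETCH_MTT_READ;
	if (dma & SKETCH_WRITE_ALLOWED)
		mtt |= SKETCH_MTT_WRITE;
	return mtt;
}

int main(void)
{
	/* one read-only page and one read-write page, both page aligned */
	uint64_t dma_list[2] = { 0x1000 | SKETCH_READ_ALLOWED,
				 0x2000 | SKETCH_READ_ALLOWED |
					  SKETCH_WRITE_ALLOWED };
	int i;

	for (i = 0; i < 2; i++)
		printf("mtt[%d] = %#llx\n", i,
		       (unsigned long long)dma_to_mtt(dma_list[i]));
	return 0;	/* prints 0x1004 and 0x200c */
}
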
+
+void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+			   struct mlx5_ib_mr *mr, int flags)
+{
+	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
+		populate_klm(xlt, idx, nentries, mr, flags);
+	} else {
+		populate_mtt(xlt, idx, nentries, mr, flags);
+	}
+}
+
+static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
+{
+	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+
+	/* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */
+	mutex_lock(&odp->umem_mutex);
+	if (odp->npages) {
+		mlx5_mr_cache_invalidate(mr);
+		ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp),
+					    ib_umem_end(odp));
+		WARN_ON(odp->npages);
+	}
+	odp->private = NULL;
+	mutex_unlock(&odp->umem_mutex);
+
+	if (!mr->cache_ent) {
+		mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey);
+		WARN_ON(mr->descs);
+	}
+}
+
+/*
+ * This must be called after the mr has been removed from implicit_children
+ * and the SRCU synchronized.  NOTE: The MR does not necessarily have to be
+ * empty here; parallel page faults could have raced with the free process and
+ * added pages to it.
+ */
+static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
+{
+	struct mlx5_ib_mr *imr = mr->parent;
 	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
 	int srcu_key;
 
-	mr->parent = NULL;
-	synchronize_srcu(&mr->dev->mr_srcu);
+	/* implicit_child_mr's are not allowed to have deferred work */
+	WARN_ON(atomic_read(&mr->num_deferred_work));
 
-	if (smp_load_acquire(&imr->live)) {
-		srcu_key = srcu_read_lock(&mr->dev->mr_srcu);
+	if (need_imr_xlt) {
+		srcu_key = srcu_read_lock(&mr->dev->odp_srcu);
 		mutex_lock(&odp_imr->umem_mutex);
-		mlx5_ib_update_xlt(imr, idx, 1, 0,
+		mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
 				   MLX5_IB_UPD_XLT_INDIRECT |
 				   MLX5_IB_UPD_XLT_ATOMIC);
 		mutex_unlock(&odp_imr->umem_mutex);
-		srcu_read_unlock(&mr->dev->mr_srcu, srcu_key);
+		srcu_read_unlock(&mr->dev->odp_srcu, srcu_key);
 	}
-	ib_umem_odp_release(odp);
-	mlx5_mr_cache_free(mr->dev, mr);
 
-	if (atomic_dec_and_test(&imr->num_leaf_free))
-		wake_up(&imr->q_leaf_free);
+	dma_fence_odp_mr(mr);
+
+	mr->parent = NULL;
+	mlx5_mr_cache_free(mr->dev, mr);
+	ib_umem_odp_release(odp);
+	if (atomic_dec_and_test(&imr->num_deferred_work))
+		wake_up(&imr->q_deferred_work);
 }
 
-void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
-			      unsigned long end)
+static void free_implicit_child_mr_work(struct work_struct *work)
 {
+	struct mlx5_ib_mr *mr =
+		container_of(work, struct mlx5_ib_mr, odp_destroy.work);
+
+	free_implicit_child_mr(mr, true);
+}
+
+static void free_implicit_child_mr_rcu(struct rcu_head *head)
+{
+	struct mlx5_ib_mr *mr =
+		container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);
+
+	/* Freeing an MR is a sleeping operation, so bounce to a work queue */
+	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
+	queue_work(system_unbound_wq, &mr->odp_destroy.work);
+}
+
+static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
+{
+	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
+	struct mlx5_ib_mr *imr = mr->parent;
+
+	xa_lock(&imr->implicit_children);
+	/*
+	 * This can race with mlx5_ib_free_implicit_mr(); the first one to
+	 * reach the xa lock wins the race and destroys the MR.
+	 */
+	if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
+	    mr)
+		goto out_unlock;
+
+	atomic_inc(&imr->num_deferred_work);
+	call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu,
+		  free_implicit_child_mr_rcu);
+
+out_unlock:
+	xa_unlock(&imr->implicit_children);
+}
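
The __xa_cmpxchg() above gives a "first to clear the slot wins" hand-off between this path and mlx5_ib_free_implicit_mr(). The same pattern is sketched below with a C11 atomic pointer standing in for the xarray slot; struct child, slot and try_destroy() are invented for the example.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct child { int idx; };

/* One slot standing in for an xarray entry holding an implicit child MR. */
static _Atomic(struct child *) slot;

static void try_destroy(const char *who)
{
	struct child *expected = atomic_load(&slot);

	/* Only the path whose compare-and-swap clears the slot may free it. */
	if (expected &&
	    atomic_compare_exchange_strong(&slot, &expected, NULL)) {
		printf("%s wins and frees child %d\n", who, expected->idx);
		free(expected);
	} else {
		printf("%s lost the race, the child was already claimed\n", who);
	}
}

int main(void)
{
	struct child *c = malloc(sizeof(*c));

	if (!c)
		return 1;
	c->idx = 3;
	atomic_store(&slot, c);
	try_destroy("invalidate path");		/* wins */
	try_destroy("free_implicit_mr path");	/* loses */
	return 0;
}
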
+
+static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
+				     const struct mmu_notifier_range *range,
+				     unsigned long cur_seq)
+{
+	struct ib_umem_odp *umem_odp =
+		container_of(mni, struct ib_umem_odp, notifier);
 	struct mlx5_ib_mr *mr;
 	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
 				    sizeof(struct mlx5_mtt)) - 1;
 	u64 idx = 0, blk_start_idx = 0;
+	u64 invalidations = 0;
+	unsigned long start;
+	unsigned long end;
 	int in_block = 0;
 	u64 addr;
 
-	if (!umem_odp) {
-		pr_err("invalidation called on NULL umem or non-ODP umem\n");
-		return;
-	}
+	if (!mmu_notifier_range_blockable(range))
+		return false;
 
+	mutex_lock(&umem_odp->umem_mutex);
+	mmu_interval_set_seq(mni, cur_seq);
+	/*
+	 * If npages is zero then umem_odp->private may not be set up yet. This
+	 * does not complete until after the first page is mapped for DMA.
+	 */
+	if (!umem_odp->npages)
+		goto out;
 	mr = umem_odp->private;
 
-	if (!mr || !mr->ibmr.pd)
-		return;
-
-	start = max_t(u64, ib_umem_start(umem_odp), start);
-	end = min_t(u64, ib_umem_end(umem_odp), end);
+	start = max_t(u64, ib_umem_start(umem_odp), range->start);
+	end = min_t(u64, ib_umem_end(umem_odp), range->end);
 
 	/*
 	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
@@ -276,7 +319,6 @@
 	 * overwrite the same MTTs.  Concurrent invalidations might race us,
 	 * but they will write 0s as well, so no difference in the end result.
 	 */
-	mutex_lock(&umem_odp->umem_mutex);
 	for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
 		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
 		/*
@@ -291,6 +333,9 @@
 				blk_start_idx = idx;
 				in_block = 1;
 			}
+
+			/* Count page invalidations */
+			invalidations += idx - blk_start_idx + 1;
 		} else {
 			u64 umr_offset = idx & umr_block_mask;
 
@@ -308,6 +353,9 @@
 				   idx - blk_start_idx + 1, 0,
 				   MLX5_IB_UPD_XLT_ZAP |
 				   MLX5_IB_UPD_XLT_ATOMIC);
+
+	mlx5_update_odp_stats(mr, invalidations, invalidations);
+
 	/*
 	 * We are now sure that the device will not access the
 	 * memory. We can safely unmap it, and mark it as dirty if
@@ -316,16 +364,17 @@
 
 	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
-	if (unlikely(!umem_odp->npages && mr->parent &&
-		     !umem_odp->dying)) {
-		WRITE_ONCE(mr->live, 0);
-		umem_odp->dying = 1;
-		atomic_inc(&mr->parent->num_leaf_free);
-		schedule_work(&umem_odp->work);
-	}
+	if (unlikely(!umem_odp->npages && mr->parent))
+		destroy_unused_implicit_child_mr(mr);
+out:
 	mutex_unlock(&umem_odp->umem_mutex);
+	return true;
 }
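
For orientation, the first loop in mlx5_ib_invalidate_range() batches runs of mapped pages so the XLT is zapped in as few UMR operations as possible. The sketch below shows only that batching idea: it omits the UMR block-alignment handling, counts each page once rather than using the driver's accounting, and uses zap_range() as a stand-in for mlx5_ib_update_xlt() with the ZAP flag.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Stand-in for mlx5_ib_update_xlt(..., MLX5_IB_UPD_XLT_ZAP). */
static void zap_range(size_t start, size_t npages)
{
	printf("zap %zu page(s) starting at index %zu\n", npages, start);
}

/* Group contiguous mapped pages into as few zap calls as possible. */
static size_t zap_mapped_blocks(const bool *mapped, size_t npages)
{
	size_t invalidations = 0, blk_start = 0, idx;
	bool in_block = false;

	for (idx = 0; idx < npages; idx++) {
		if (mapped[idx]) {
			if (!in_block) {
				blk_start = idx;
				in_block = true;
			}
			invalidations++;
		} else if (in_block) {
			zap_range(blk_start, idx - blk_start);
			in_block = false;
		}
	}
	if (in_block)
		zap_range(blk_start, npages - blk_start);
	return invalidations;
}

int main(void)
{
	bool mapped[8] = { true, true, false, true, false, true, true, true };

	printf("invalidated %zu page(s)\n", zap_mapped_blocks(mapped, 8));
	return 0;
}
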
 
+const struct mmu_interval_notifier_ops mlx5_mn_ops = {
+	.invalidate = mlx5_ib_invalidate_range,
+};
+
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
 {
 	struct ib_odp_caps *caps = &dev->odp_caps;
@@ -333,7 +382,7 @@
 	memset(caps, 0, sizeof(*caps));
 
 	if (!MLX5_CAP_GEN(dev->mdev, pg) ||
-	    !mlx5_ib_can_use_umr(dev, true))
+	    !mlx5_ib_can_load_pas_with_umr(dev, 0))
 		return;
 
 	caps->general_caps = IB_ODP_SUPPORT;
@@ -390,8 +439,6 @@
 	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
 	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
 		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
-
-	return;
 }
 
 static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
@@ -400,8 +447,7 @@
 {
 	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
 		     pfault->wqe.wq_num : pfault->token;
-	u32 out[MLX5_ST_SZ_DW(page_fault_resume_out)] = { };
-	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)]   = { };
+	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
 	int err;
 
 	MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
@@ -410,324 +456,395 @@
 	MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
 	MLX5_SET(page_fault_resume_in, in, error, !!error);
 
-	err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+	err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
 	if (err)
 		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
 			    wq_num, err);
 }
 
-static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
-					    struct ib_umem_odp *umem_odp,
-					    bool ksm, int access_flags)
+static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
+						unsigned long idx)
 {
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct ib_umem_odp *odp;
 	struct mlx5_ib_mr *mr;
+	struct mlx5_ib_mr *ret;
 	int err;
 
-	mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY :
-					    MLX5_IMR_MTT_CACHE_ENTRY);
+	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
+				      idx * MLX5_IMR_MTT_SIZE,
+				      MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
+	if (IS_ERR(odp))
+		return ERR_CAST(odp);
 
+	ret = mr = mlx5_mr_cache_alloc(imr->dev, MLX5_IMR_MTT_CACHE_ENTRY,
+				       imr->access_flags);
 	if (IS_ERR(mr))
-		return mr;
+		goto out_umem;
 
-	mr->ibmr.pd = pd;
-
-	mr->dev = dev;
-	mr->access_flags = access_flags;
-	mr->mmkey.iova = 0;
-	mr->umem = &umem_odp->umem;
-
-	if (ksm) {
-		err = mlx5_ib_update_xlt(mr, 0,
-					 mlx5_imr_ksm_entries,
-					 MLX5_KSM_PAGE_SHIFT,
-					 MLX5_IB_UPD_XLT_INDIRECT |
-					 MLX5_IB_UPD_XLT_ZAP |
-					 MLX5_IB_UPD_XLT_ENABLE);
-
-	} else {
-		err = mlx5_ib_update_xlt(mr, 0,
-					 MLX5_IMR_MTT_ENTRIES,
-					 PAGE_SHIFT,
-					 MLX5_IB_UPD_XLT_ZAP |
-					 MLX5_IB_UPD_XLT_ENABLE |
-					 MLX5_IB_UPD_XLT_ATOMIC);
-	}
-
-	if (err)
-		goto fail;
-
+	mr->ibmr.pd = imr->ibmr.pd;
+	mr->umem = &odp->umem;
 	mr->ibmr.lkey = mr->mmkey.key;
 	mr->ibmr.rkey = mr->mmkey.key;
+	mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE;
+	mr->parent = imr;
+	odp->private = mr;
 
-	mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
-		    mr->mmkey.key, dev->mdev, mr);
+	err = mlx5_ib_update_xlt(mr, 0,
+				 MLX5_IMR_MTT_ENTRIES,
+				 PAGE_SHIFT,
+				 MLX5_IB_UPD_XLT_ZAP |
+				 MLX5_IB_UPD_XLT_ENABLE);
+	if (err) {
+		ret = ERR_PTR(err);
+		goto out_mr;
+	}
 
+	/*
+	 * Once the store to either xarray completes, any error unwind has to
+	 * use synchronize_srcu(). Avoid this with xa_reserve().
+	 */
+	ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
+			 GFP_KERNEL);
+	if (unlikely(ret)) {
+		if (xa_is_err(ret)) {
+			ret = ERR_PTR(xa_err(ret));
+			goto out_mr;
+		}
+		/*
+		 * Another thread beat us to creating the child mr, use
+		 * theirs.
+		 */
+		goto out_mr;
+	}
+
+	mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr);
 	return mr;
 
-fail:
-	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
-	mlx5_mr_cache_free(dev, mr);
-
-	return ERR_PTR(err);
-}
-
-static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr,
-						u64 io_virt, size_t bcnt)
-{
-	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device);
-	struct ib_umem_odp *odp, *result = NULL;
-	struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
-	u64 addr = io_virt & MLX5_IMR_MTT_MASK;
-	int nentries = 0, start_idx = 0, ret;
-	struct mlx5_ib_mr *mtt;
-
-	mutex_lock(&odp_mr->umem_mutex);
-	odp = odp_lookup(addr, 1, mr);
-
-	mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n",
-		    io_virt, bcnt, addr, odp);
-
-next_mr:
-	if (likely(odp)) {
-		if (nentries)
-			nentries++;
-	} else {
-		odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE);
-		if (IS_ERR(odp)) {
-			mutex_unlock(&odp_mr->umem_mutex);
-			return ERR_CAST(odp);
-		}
-
-		mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0,
-					mr->access_flags);
-		if (IS_ERR(mtt)) {
-			mutex_unlock(&odp_mr->umem_mutex);
-			ib_umem_odp_release(odp);
-			return ERR_CAST(mtt);
-		}
-
-		odp->private = mtt;
-		mtt->umem = &odp->umem;
-		mtt->mmkey.iova = addr;
-		mtt->parent = mr;
-		INIT_WORK(&odp->work, mr_leaf_free_action);
-
-		smp_store_release(&mtt->live, 1);
-
-		if (!nentries)
-			start_idx = addr >> MLX5_IMR_MTT_SHIFT;
-		nentries++;
-	}
-
-	/* Return first odp if region not covered by single one */
-	if (likely(!result))
-		result = odp;
-
-	addr += MLX5_IMR_MTT_SIZE;
-	if (unlikely(addr < io_virt + bcnt)) {
-		odp = odp_next(odp);
-		if (odp && ib_umem_start(odp) != addr)
-			odp = NULL;
-		goto next_mr;
-	}
-
-	if (unlikely(nentries)) {
-		ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0,
-					 MLX5_IB_UPD_XLT_INDIRECT |
-					 MLX5_IB_UPD_XLT_ATOMIC);
-		if (ret) {
-			mlx5_ib_err(dev, "Failed to update PAS\n");
-			result = ERR_PTR(ret);
-		}
-	}
-
-	mutex_unlock(&odp_mr->umem_mutex);
-	return result;
+out_mr:
+	mlx5_mr_cache_free(imr->dev, mr);
+out_umem:
+	ib_umem_odp_release(odp);
+	return ret;
 }
 
 struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 					     struct ib_udata *udata,
 					     int access_flags)
 {
-	struct mlx5_ib_mr *imr;
+	struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
 	struct ib_umem_odp *umem_odp;
+	struct mlx5_ib_mr *imr;
+	int err;
 
-	umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags);
+	umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
 	if (IS_ERR(umem_odp))
 		return ERR_CAST(umem_odp);
 
-	imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags);
+	imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags);
 	if (IS_ERR(imr)) {
-		ib_umem_odp_release(umem_odp);
-		return ERR_CAST(imr);
+		err = PTR_ERR(imr);
+		goto out_umem;
 	}
 
+	imr->ibmr.pd = &pd->ibpd;
+	imr->mmkey.iova = 0;
 	imr->umem = &umem_odp->umem;
-	init_waitqueue_head(&imr->q_leaf_free);
-	atomic_set(&imr->num_leaf_free, 0);
-	atomic_set(&imr->num_pending_prefetch, 0);
-	smp_store_release(&imr->live, 1);
+	imr->ibmr.lkey = imr->mmkey.key;
+	imr->ibmr.rkey = imr->mmkey.key;
+	imr->umem = &umem_odp->umem;
+	imr->is_odp_implicit = true;
+	atomic_set(&imr->num_deferred_work, 0);
+	init_waitqueue_head(&imr->q_deferred_work);
+	xa_init(&imr->implicit_children);
 
+	err = mlx5_ib_update_xlt(imr, 0,
+				 mlx5_imr_ksm_entries,
+				 MLX5_KSM_PAGE_SHIFT,
+				 MLX5_IB_UPD_XLT_INDIRECT |
+				 MLX5_IB_UPD_XLT_ZAP |
+				 MLX5_IB_UPD_XLT_ENABLE);
+	if (err)
+		goto out_mr;
+
+	err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key),
+			      &imr->mmkey, GFP_KERNEL));
+	if (err)
+		goto out_mr;
+
+	mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
 	return imr;
+out_mr:
+	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
+	mlx5_mr_cache_free(dev, imr);
+out_umem:
+	ib_umem_odp_release(umem_odp);
+	return ERR_PTR(err);
 }
 
 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 {
-	struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr);
-	struct rb_node *node;
+	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+	struct mlx5_ib_dev *dev = imr->dev;
+	struct list_head destroy_list;
+	struct mlx5_ib_mr *mtt;
+	struct mlx5_ib_mr *tmp;
+	unsigned long idx;
 
-	down_read(&per_mm->umem_rwsem);
-	for (node = rb_first_cached(&per_mm->umem_tree); node;
-	     node = rb_next(node)) {
-		struct ib_umem_odp *umem_odp =
-			rb_entry(node, struct ib_umem_odp, interval_tree.rb);
-		struct mlx5_ib_mr *mr = umem_odp->private;
+	INIT_LIST_HEAD(&destroy_list);
 
-		if (mr->parent != imr)
-			continue;
+	xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
+	/*
+	 * This stops the SRCU protected page fault path from touching either
+	 * the imr or any children. The page fault path can only reach the
+	 * children xarray via the imr.
+	 */
+	synchronize_srcu(&dev->odp_srcu);
 
-		mutex_lock(&umem_odp->umem_mutex);
-		ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
-					    ib_umem_end(umem_odp));
+	/*
+	 * All work on the prefetch list must be completed; xa_erase() prevented
+	 * new work from being created.
+	 */
+	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
 
-		if (umem_odp->dying) {
-			mutex_unlock(&umem_odp->umem_mutex);
-			continue;
-		}
+	/*
+	 * At this point it is forbidden for any other thread to enter
+	 * pagefault_mr() on this imr. It is already forbidden to call
+	 * pagefault_mr() on an implicit child. Due to this, additions to
+	 * implicit_children are prevented.
+	 */
 
-		umem_odp->dying = 1;
-		atomic_inc(&imr->num_leaf_free);
-		schedule_work(&umem_odp->work);
-		mutex_unlock(&umem_odp->umem_mutex);
+	/*
+	 * Block destroy_unused_implicit_child_mr() from incrementing
+	 * num_deferred_work.
+	 */
+	xa_lock(&imr->implicit_children);
+	xa_for_each (&imr->implicit_children, idx, mtt) {
+		__xa_erase(&imr->implicit_children, idx);
+		list_add(&mtt->odp_destroy.elm, &destroy_list);
 	}
-	up_read(&per_mm->umem_rwsem);
+	xa_unlock(&imr->implicit_children);
 
-	wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free));
+	/*
+	 * Wait for any concurrent destroy_unused_implicit_child_mr() to
+	 * complete.
+	 */
+	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
+
+	/*
+	 * Fence the imr before we destroy the children. This allows us to
+	 * skip updating the XLT of the imr during destroy of the child mkey
+	 * the imr points to.
+	 */
+	mlx5_mr_cache_invalidate(imr);
+
+	list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm)
+		free_implicit_child_mr(mtt, false);
+
+	mlx5_mr_cache_free(dev, imr);
+	ib_umem_odp_release(odp_imr);
 }
 
-#define MLX5_PF_FLAGS_PREFETCH  BIT(0)
-#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
-static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
-			u64 io_virt, size_t bcnt, u32 *bytes_mapped,
-			u32 flags)
+/**
+ * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR
+ * @mr: to fence
+ *
+ * On return, no parallel threads will be touching this MR and no DMA will be
+ * active.
+ */
+void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
 {
-	int npages = 0, current_seq, page_shift, ret, np;
-	struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem);
+	/* Prevent new page faults and prefetch requests from succeeding */
+	xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));
+
+	/* Wait for all running page-fault handlers to finish. */
+	synchronize_srcu(&mr->dev->odp_srcu);
+
+	wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work));
+
+	dma_fence_odp_mr(mr);
+}
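
The fence above combines three steps: dropping the mkey from the xarray, an SRCU grace period, and waiting for num_deferred_work to reach zero. As a standalone illustration of only the counter/wait piece, the userspace sketch below models atomic_dec_and_test()/wake_up()/wait_event() with C11 atomics and a pthread condition variable; the names (deferred_fence, worker_done, fence_wait) are illustrative and not part of the driver.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct deferred_fence {
	atomic_int num_deferred;	/* like mr->num_deferred_work */
	pthread_mutex_t lock;
	pthread_cond_t cond;		/* like mr->q_deferred_work */
};

static void worker_done(struct deferred_fence *f)
{
	/* models atomic_dec_and_test() + wake_up() */
	if (atomic_fetch_sub(&f->num_deferred, 1) == 1) {
		pthread_mutex_lock(&f->lock);
		pthread_cond_broadcast(&f->cond);
		pthread_mutex_unlock(&f->lock);
	}
}

static void fence_wait(struct deferred_fence *f)
{
	/* models wait_event(q, !atomic_read(&num_deferred)) */
	pthread_mutex_lock(&f->lock);
	while (atomic_load(&f->num_deferred))
		pthread_cond_wait(&f->cond, &f->lock);
	pthread_mutex_unlock(&f->lock);
}

static void *worker(void *arg)
{
	worker_done(arg);	/* one unit of deferred work finishes */
	return NULL;
}

int main(void)
{
	struct deferred_fence f = {
		.num_deferred = 1,
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.cond = PTHREAD_COND_INITIALIZER,
	};
	pthread_t t;

	pthread_create(&t, NULL, worker, &f);
	fence_wait(&f);		/* returns only after the worker is done */
	pthread_join(t, NULL);
	printf("fenced\n");
	return 0;
}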
+
+#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
+#define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
+#define MLX5_PF_FLAGS_ENABLE BIT(3)
+static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
+			     u64 user_va, size_t bcnt, u32 *bytes_mapped,
+			     u32 flags)
+{
+	int page_shift, ret, np;
 	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
-	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
 	u64 access_mask;
-	u64 start_idx, page_mask;
-	struct ib_umem_odp *odp;
-	size_t size;
+	u64 start_idx;
+	bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
+	u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
 
-	if (odp_mr->is_implicit_odp) {
-		odp = implicit_mr_get_data(mr, io_virt, bcnt);
-
-		if (IS_ERR(odp))
-			return PTR_ERR(odp);
-		mr = odp->private;
-	} else {
-		odp = odp_mr;
-	}
-
-next_mr:
-	size = min_t(size_t, bcnt, ib_umem_end(odp) - io_virt);
+	if (flags & MLX5_PF_FLAGS_ENABLE)
+		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
 
 	page_shift = odp->page_shift;
-	page_mask = ~(BIT(page_shift) - 1);
-	start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift;
+	start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
 	access_mask = ODP_READ_ALLOWED_BIT;
 
-	if (prefetch && !downgrade && !odp->umem.writable) {
-		/* prefetch with write-access must
-		 * be supported by the MR
-		 */
-		ret = -EINVAL;
-		goto out;
-	}
-
 	if (odp->umem.writable && !downgrade)
 		access_mask |= ODP_WRITE_ALLOWED_BIT;
 
-	current_seq = READ_ONCE(odp->notifiers_seq);
+	np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
+	if (np < 0)
+		return np;
+
 	/*
-	 * Ensure the sequence number is valid for some time before we call
-	 * gup.
+	 * No need to check whether the MTTs really belong to this MR, since
+	 * ib_umem_odp_map_dma_and_lock already checks this.
 	 */
-	smp_rmb();
-
-	ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask,
-					current_seq);
-
-	if (ret < 0)
-		goto out;
-
-	np = ret;
-
-	mutex_lock(&odp->umem_mutex);
-	if (!ib_umem_mmu_notifier_retry(odp, current_seq)) {
-		/*
-		 * No need to check whether the MTTs really belong to
-		 * this MR, since ib_umem_odp_map_dma_pages already
-		 * checks this.
-		 */
-		ret = mlx5_ib_update_xlt(mr, start_idx, np,
-					 page_shift, MLX5_IB_UPD_XLT_ATOMIC);
-	} else {
-		ret = -EAGAIN;
-	}
+	ret = mlx5_ib_update_xlt(mr, start_idx, np, page_shift, xlt_flags);
 	mutex_unlock(&odp->umem_mutex);
 
 	if (ret < 0) {
 		if (ret != -EAGAIN)
-			mlx5_ib_err(dev, "Failed to update mkey page tables\n");
+			mlx5_ib_err(mr->dev,
+				    "Failed to update mkey page tables\n");
 		goto out;
 	}
 
 	if (bytes_mapped) {
 		u32 new_mappings = (np << page_shift) -
-			(io_virt - round_down(io_virt, 1 << page_shift));
-		*bytes_mapped += min_t(u32, new_mappings, size);
+			(user_va - round_down(user_va, 1 << page_shift));
+
+		*bytes_mapped += min_t(u32, new_mappings, bcnt);
 	}
 
-	npages += np << (page_shift - PAGE_SHIFT);
-	bcnt -= size;
-
-	if (unlikely(bcnt)) {
-		struct ib_umem_odp *next;
-
-		io_virt += size;
-		next = odp_next(odp);
-		if (unlikely(!next || ib_umem_start(next) != io_virt)) {
-			mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n",
-				    io_virt, next);
-			return -EAGAIN;
-		}
-		odp = next;
-		mr = odp->private;
-		goto next_mr;
-	}
-
-	return npages;
+	return np << (page_shift - PAGE_SHIFT);
 
 out:
-	if (ret == -EAGAIN) {
-		unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
+	return ret;
+}
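
The XLT start index and bytes_mapped arithmetic in pagefault_real_mr() can be exercised in isolation. The sketch below uses illustrative values for the umem start, page shift and pinned-page count; none of them come from hardware.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const unsigned int page_shift = 12;            /* 4 KiB pages */
	const uint64_t umem_start = 0x7f0000000000ULL; /* ib_umem_start(odp) */
	uint64_t user_va = 0x7f0000003100ULL;          /* faulting address */
	uint64_t bcnt = 0x2000;                        /* bytes requested */
	int np = 3;                                    /* pages pinned by the core */

	/* first XLT entry to update, relative to the umem start */
	uint64_t start_idx = (user_va - umem_start) >> page_shift;

	/* bytes newly covered, clamped to the request as in the driver */
	uint64_t page_start = user_va & ~((1ULL << page_shift) - 1);
	uint32_t new_mappings = (np << page_shift) - (user_va - page_start);
	uint64_t bytes_mapped = new_mappings < bcnt ? new_mappings : bcnt;

	printf("start_idx=%llu new_mappings=%u bytes_mapped=%llu\n",
	       (unsigned long long)start_idx, new_mappings,
	       (unsigned long long)bytes_mapped);
	return 0;
}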
 
-		if (!wait_for_completion_timeout(&odp->notifier_completion,
-						 timeout)) {
-			mlx5_ib_warn(
-				dev,
-				"timeout waiting for mmu notifier. seq %d against %d. notifiers_count=%d\n",
-				current_seq, odp->notifiers_seq,
-				odp->notifiers_count);
+static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
+				 struct ib_umem_odp *odp_imr, u64 user_va,
+				 size_t bcnt, u32 *bytes_mapped, u32 flags)
+{
+	unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
+	unsigned long upd_start_idx = end_idx + 1;
+	unsigned long upd_len = 0;
+	unsigned long npages = 0;
+	int err;
+	int ret;
+
+	if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
+		     mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
+		return -EFAULT;
+
+	/* Fault each child mr that intersects with our interval. */
+	while (bcnt) {
+		unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
+		struct ib_umem_odp *umem_odp;
+		struct mlx5_ib_mr *mtt;
+		u64 len;
+
+		mtt = xa_load(&imr->implicit_children, idx);
+		if (unlikely(!mtt)) {
+			mtt = implicit_get_child_mr(imr, idx);
+			if (IS_ERR(mtt)) {
+				ret = PTR_ERR(mtt);
+				goto out;
+			}
+			upd_start_idx = min(upd_start_idx, idx);
+			upd_len = idx - upd_start_idx + 1;
 		}
+
+		umem_odp = to_ib_umem_odp(mtt->umem);
+		len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
+		      user_va;
+
+		ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
+					bytes_mapped, flags);
+		if (ret < 0)
+			goto out;
+		user_va += len;
+		bcnt -= len;
+		npages += ret;
 	}
 
+	ret = npages;
+
+	/*
+	 * Any time the implicit_children are changed we must perform an
+	 * update of the xlt before exiting to ensure the HW and the
+	 * implicit_children remain synchronized.
+	 */
+out:
+	if (likely(!upd_len))
+		return ret;
+
+	/*
+	 * Notice this is not strictly ordered: the KSM is updated after the
+	 * implicit_children xarray, so a parallel page fault could see an MR
+	 * that is not yet visible in the KSM.  This is similar to a parallel
+	 * page fault seeing an MR that is being concurrently removed
+	 * from the KSM. Both of these improbable situations are resolved
+	 * safely by resuming the HW and then taking another page fault. The
+	 * next pagefault handler will see the new information.
+	 */
+	mutex_lock(&odp_imr->umem_mutex);
+	err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0,
+				 MLX5_IB_UPD_XLT_INDIRECT |
+					 MLX5_IB_UPD_XLT_ATOMIC);
+	mutex_unlock(&odp_imr->umem_mutex);
+	if (err) {
+		mlx5_ib_err(imr->dev, "Failed to update PAS\n");
+		return err;
+	}
 	return ret;
 }
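
The child-MR indexing above splits the fault range into MLX5_IMR_MTT_SIZE slots of the KSM. A minimal sketch of that bounds check and slot computation, with illustrative constants (the shift and entry count here are placeholders, not the driver's values):

#include <stdint.h>
#include <stdio.h>

#define IMR_MTT_SHIFT	30			/* one child covers 1 GiB here */
#define IMR_MTT_SIZE	(1ULL << IMR_MTT_SHIFT)
#define KSM_ENTRIES	512			/* number of child slots */

int main(void)
{
	uint64_t user_va = 5ULL * IMR_MTT_SIZE + 0x1000;
	uint64_t bcnt = 3ULL * IMR_MTT_SIZE;

	/* reject ranges outside the address space the KSM can describe */
	if (user_va >= KSM_ENTRIES * IMR_MTT_SIZE ||
	    KSM_ENTRIES * IMR_MTT_SIZE - user_va < bcnt) {
		puts("-EFAULT");
		return 1;
	}

	/* first and last child MR touched by the fault */
	unsigned long first = user_va >> IMR_MTT_SHIFT;
	unsigned long last = (user_va + bcnt - 1) >> IMR_MTT_SHIFT;

	printf("children %lu..%lu\n", first, last);
	return 0;
}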
 
+/*
+ * Returns:
+ *  -EFAULT: The io_virt..io_virt+bcnt range is not within the MR, it covers
+ *           pages that are not accessible, or the MR is no longer valid.
+ *  -EAGAIN/-ENOMEM: The operation should be retried
+ *
+ *  -EINVAL/others: General internal malfunction
+ *  >0: Number of pages mapped
+ */
+static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
+			u32 *bytes_mapped, u32 flags)
+{
+	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+
+	lockdep_assert_held(&mr->dev->odp_srcu);
+	if (unlikely(io_virt < mr->mmkey.iova))
+		return -EFAULT;
+
+	if (!odp->is_implicit_odp) {
+		u64 user_va;
+
+		if (check_add_overflow(io_virt - mr->mmkey.iova,
+				       (u64)odp->umem.address, &user_va))
+			return -EFAULT;
+		if (unlikely(user_va >= ib_umem_end(odp) ||
+			     ib_umem_end(odp) - user_va < bcnt))
+			return -EFAULT;
+		return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
+					 flags);
+	}
+	return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
+				     flags);
+}
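
The io_virt to user_va translation in pagefault_mr() guards against wrap-around with check_add_overflow(), which wraps the GCC/Clang __builtin_add_overflow used directly in the sketch below; the iova, umem address and umem end values are illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool translate(uint64_t io_virt, uint64_t bcnt, uint64_t iova,
		      uint64_t umem_address, uint64_t umem_end,
		      uint64_t *user_va)
{
	if (io_virt < iova)
		return false;
	if (__builtin_add_overflow(io_virt - iova, umem_address, user_va))
		return false;
	if (*user_va >= umem_end || umem_end - *user_va < bcnt)
		return false;
	return true;
}

int main(void)
{
	uint64_t user_va;
	bool ok = translate(0x1000, 0x100, 0x1000,
			    0x7f0000000000ULL, 0x7f0000010000ULL, &user_va);

	printf("ok=%d user_va=0x%llx\n", ok, (unsigned long long)user_va);
	return 0;
}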
+
+int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr, bool enable)
+{
+	u32 flags = MLX5_PF_FLAGS_SNAPSHOT;
+	int ret;
+
+	if (enable)
+		flags |= MLX5_PF_FLAGS_ENABLE;
+
+	ret = pagefault_real_mr(mr, to_ib_umem_odp(mr->umem),
+				mr->umem->address, mr->umem->length, NULL,
+				flags);
+	return ret >= 0 ? 0 : ret;
+}
+
 struct pf_frame {
 	struct pf_frame *next;
 	u32 key;
@@ -775,10 +892,9 @@
 					 struct ib_pd *pd, u32 key,
 					 u64 io_virt, size_t bcnt,
 					 u32 *bytes_committed,
-					 u32 *bytes_mapped, u32 flags)
+					 u32 *bytes_mapped)
 {
 	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
-	bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH;
 	struct pf_frame *head = NULL, *frame;
 	struct mlx5_core_mkey *mmkey;
 	struct mlx5_ib_mr *mr;
@@ -787,58 +903,44 @@
 	size_t offset;
 	int ndescs;
 
-	srcu_key = srcu_read_lock(&dev->mr_srcu);
+	srcu_key = srcu_read_lock(&dev->odp_srcu);
 
 	io_virt += *bytes_committed;
 	bcnt -= *bytes_committed;
 
 next_mr:
-	mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(key));
+	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
+	if (!mmkey) {
+		mlx5_ib_dbg(
+			dev,
+			"skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
+			key);
+		if (bytes_mapped)
+			*bytes_mapped += bcnt;
+		/*
+		 * The user could specify an SGL with multiple lkeys and only
+		 * some of them are ODP. Treat the non-ODP ones as fully
+		 * faulted.
+		 */
+		ret = 0;
+		goto srcu_unlock;
+	}
 	if (!mkey_is_eq(mmkey, key)) {
 		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
 		ret = -EFAULT;
 		goto srcu_unlock;
 	}
 
-	if (prefetch && mmkey->type != MLX5_MKEY_MR) {
-		mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n");
-		ret = -EINVAL;
-		goto srcu_unlock;
-	}
-
 	switch (mmkey->type) {
 	case MLX5_MKEY_MR:
 		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
-		if (!smp_load_acquire(&mr->live) || !mr->ibmr.pd) {
-			mlx5_ib_dbg(dev, "got dead MR\n");
-			ret = -EFAULT;
-			goto srcu_unlock;
-		}
 
-		if (prefetch) {
-			if (!is_odp_mr(mr) ||
-			    mr->ibmr.pd != pd) {
-				mlx5_ib_dbg(dev, "Invalid prefetch request: %s\n",
-					    is_odp_mr(mr) ?  "MR is not ODP" :
-					    "PD is not of the MR");
-				ret = -EINVAL;
-				goto srcu_unlock;
-			}
-		}
-
-		if (!is_odp_mr(mr)) {
-			mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
-				    key);
-			if (bytes_mapped)
-				*bytes_mapped += bcnt;
-			ret = 0;
-			goto srcu_unlock;
-		}
-
-		ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags);
+		ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
 		if (ret < 0)
 			goto srcu_unlock;
 
+		mlx5_update_odp_stats(mr, faults, ret);
+
 		npages += ret;
 		ret = 0;
 		break;
@@ -928,7 +1030,7 @@
 	}
 	kfree(out);
 
-	srcu_read_unlock(&dev->mr_srcu, srcu_key);
+	srcu_read_unlock(&dev->odp_srcu, srcu_key);
 	*bytes_committed = 0;
 	return ret ? ret : npages;
 }
@@ -1009,7 +1111,7 @@
 		ret = pagefault_single_data_segment(dev, NULL, key,
 						    io_virt, bcnt,
 						    &pfault->bytes_committed,
-						    bytes_mapped, 0);
+						    bytes_mapped);
 		if (ret < 0)
 			break;
 		npages += ret;
@@ -1054,8 +1156,7 @@
 	if (qp->ibqp.qp_type == IB_QPT_XRC_INI)
 		*wqe += sizeof(struct mlx5_wqe_xrc_seg);
 
-	if (qp->ibqp.qp_type == IB_QPT_UD ||
-	    qp->qp_sub_type == MLX5_IB_QPT_DCI) {
+	if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) {
 		av = *wqe;
 		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
 			*wqe += sizeof(struct mlx5_av);
@@ -1108,7 +1209,7 @@
 	struct mlx5_ib_wq *wq = &qp->rq;
 	int wqe_size = 1 << wq->wqe_shift;
 
-	if (qp->wq_sig) {
+	if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) {
 		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
 		return -EFAULT;
 	}
@@ -1138,7 +1239,7 @@
 	case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
 	case MLX5_WQE_PF_TYPE_RESP:
 	case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
-		common = mlx5_core_res_hold(dev->mdev, wq_num, MLX5_RES_QP);
+		common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP);
 		break;
 	default:
 		break;
@@ -1197,15 +1298,15 @@
 	wqe = wqe_start;
 	qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
 	if (qp && sq) {
-		ret = mlx5_ib_read_user_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
-					       &bytes_copied);
+		ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
+					  &bytes_copied);
 		if (ret)
 			goto read_user;
 		ret = mlx5_ib_mr_initiator_pfault_handler(
 			dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
 	} else if (qp && !sq) {
-		ret = mlx5_ib_read_user_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
-					       &bytes_copied);
+		ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
+					  &bytes_copied);
 		if (ret)
 			goto read_user;
 		ret = mlx5_ib_mr_responder_pfault_handler_rq(
@@ -1213,8 +1314,8 @@
 	} else if (!qp) {
 		struct mlx5_ib_srq *srq = res_to_srq(res);
 
-		ret = mlx5_ib_read_user_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
-						&bytes_copied);
+		ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
+					   &bytes_copied);
 		if (ret)
 			goto read_user;
 		ret = mlx5_ib_mr_responder_pfault_handler_srq(
@@ -1292,8 +1393,7 @@
 	}
 
 	ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
-					    &pfault->bytes_committed, NULL,
-					    0);
+					    &pfault->bytes_committed, NULL);
 	if (ret == -EAGAIN) {
 		/* We're racing with an invalidation, don't prefetch */
 		prefetch_activated = 0;
@@ -1320,8 +1420,7 @@
 
 		ret = pagefault_single_data_segment(dev, NULL, rkey, address,
 						    prefetch_len,
-						    &bytes_committed, NULL,
-						    0);
+						    &bytes_committed, NULL);
 		if (ret < 0 && ret != -EAGAIN) {
 			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
 				    ret, pfault->token, address, prefetch_len);
@@ -1581,7 +1680,6 @@
 
 static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
 	.advise_mr = mlx5_ib_advise_mr,
-	.invalidate_range = mlx5_ib_invalidate_range,
 };
 
 int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
@@ -1624,114 +1722,141 @@
 
 struct prefetch_mr_work {
 	struct work_struct work;
-	struct ib_pd *pd;
 	u32 pf_flags;
 	u32 num_sge;
-	struct ib_sge sg_list[0];
+	struct {
+		u64 io_virt;
+		struct mlx5_ib_mr *mr;
+		size_t length;
+	} frags[];
 };
 
-static void num_pending_prefetch_dec(struct mlx5_ib_dev *dev,
-				     struct ib_sge *sg_list, u32 num_sge,
-				     u32 from)
+static void destroy_prefetch_work(struct prefetch_mr_work *work)
 {
 	u32 i;
+
+	for (i = 0; i < work->num_sge; ++i)
+		if (atomic_dec_and_test(&work->frags[i].mr->num_deferred_work))
+			wake_up(&work->frags[i].mr->q_deferred_work);
+	kvfree(work);
+}
+
+static struct mlx5_ib_mr *
+get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
+		    u32 lkey)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_core_mkey *mmkey;
+	struct ib_umem_odp *odp;
+	struct mlx5_ib_mr *mr;
+
+	lockdep_assert_held(&dev->odp_srcu);
+
+	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
+	if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
+		return NULL;
+
+	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+
+	if (mr->ibmr.pd != pd)
+		return NULL;
+
+	odp = to_ib_umem_odp(mr->umem);
+
+	/* prefetch with write-access must be supported by the MR */
+	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
+	    !odp->umem.writable)
+		return NULL;
+
+	return mr;
+}
+
+static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
+{
+	struct prefetch_mr_work *work =
+		container_of(w, struct prefetch_mr_work, work);
+	struct mlx5_ib_dev *dev;
+	u32 bytes_mapped = 0;
 	int srcu_key;
+	int ret;
+	u32 i;
 
-	srcu_key = srcu_read_lock(&dev->mr_srcu);
-
-	for (i = from; i < num_sge; ++i) {
-		struct mlx5_core_mkey *mmkey;
-		struct mlx5_ib_mr *mr;
-
-		mmkey = xa_load(&dev->mdev->priv.mkey_table,
-				mlx5_base_mkey(sg_list[i].lkey));
-		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
-		atomic_dec(&mr->num_pending_prefetch);
+	/* We rely on IB/core to only execute this work when num_sge != 0. */
+	WARN_ON(!work->num_sge);
+	dev = work->frags[0].mr->dev;
+	/* SRCU should be held when calling mlx5_odp_populate_xlt() */
+	srcu_key = srcu_read_lock(&dev->odp_srcu);
+	for (i = 0; i < work->num_sge; ++i) {
+		ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
+				   work->frags[i].length, &bytes_mapped,
+				   work->pf_flags);
+		if (ret <= 0)
+			continue;
+		mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
 	}
+	srcu_read_unlock(&dev->odp_srcu, srcu_key);
 
-	srcu_read_unlock(&dev->mr_srcu, srcu_key);
+	destroy_prefetch_work(work);
 }
 
-static bool num_pending_prefetch_inc(struct ib_pd *pd,
-				     struct ib_sge *sg_list, u32 num_sge)
+static bool init_prefetch_work(struct ib_pd *pd,
+			       enum ib_uverbs_advise_mr_advice advice,
+			       u32 pf_flags, struct prefetch_mr_work *work,
+			       struct ib_sge *sg_list, u32 num_sge)
 {
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	bool ret = true;
 	u32 i;
 
+	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
+	work->pf_flags = pf_flags;
+
 	for (i = 0; i < num_sge; ++i) {
-		struct mlx5_core_mkey *mmkey;
-		struct mlx5_ib_mr *mr;
-
-		mmkey = xa_load(&dev->mdev->priv.mkey_table,
-				mlx5_base_mkey(sg_list[i].lkey));
-		if (!mmkey || mmkey->key != sg_list[i].lkey) {
-			ret = false;
-			break;
+		work->frags[i].io_virt = sg_list[i].addr;
+		work->frags[i].length = sg_list[i].length;
+		work->frags[i].mr =
+			get_prefetchable_mr(pd, advice, sg_list[i].lkey);
+		if (!work->frags[i].mr) {
+			work->num_sge = i;
+			return false;
 		}
 
-		if (mmkey->type != MLX5_MKEY_MR) {
-			ret = false;
-			break;
-		}
-
-		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
-
-		if (!smp_load_acquire(&mr->live)) {
-			ret = false;
-			break;
-		}
-
-		if (mr->ibmr.pd != pd) {
-			ret = false;
-			break;
-		}
-
-		atomic_inc(&mr->num_pending_prefetch);
+		/* Keep the MR pointer valid outside the SRCU read section */
+		atomic_inc(&work->frags[i].mr->num_deferred_work);
 	}
-
-	if (!ret)
-		num_pending_prefetch_dec(dev, sg_list, i, 0);
-
-	return ret;
+	work->num_sge = num_sge;
+	return true;
 }
 
-static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, u32 pf_flags,
-				    struct ib_sge *sg_list, u32 num_sge)
+static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
+				    enum ib_uverbs_advise_mr_advice advice,
+				    u32 pf_flags, struct ib_sge *sg_list,
+				    u32 num_sge)
 {
-	u32 i;
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	u32 bytes_mapped = 0;
+	int srcu_key;
 	int ret = 0;
-	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	u32 i;
 
+	srcu_key = srcu_read_lock(&dev->odp_srcu);
 	for (i = 0; i < num_sge; ++i) {
-		struct ib_sge *sg = &sg_list[i];
-		int bytes_committed = 0;
+		struct mlx5_ib_mr *mr;
 
-		ret = pagefault_single_data_segment(dev, pd, sg->lkey, sg->addr,
-						    sg->length,
-						    &bytes_committed, NULL,
-						    pf_flags);
+		mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
+		if (!mr) {
+			ret = -ENOENT;
+			goto out;
+		}
+		ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
+				   &bytes_mapped, pf_flags);
 		if (ret < 0)
-			break;
+			goto out;
+		mlx5_update_odp_stats(mr, prefetch, ret);
 	}
+	ret = 0;
 
-	return ret < 0 ? ret : 0;
-}
-
-static void mlx5_ib_prefetch_mr_work(struct work_struct *work)
-{
-	struct prefetch_mr_work *w =
-		container_of(work, struct prefetch_mr_work, work);
-
-	if (ib_device_try_get(w->pd->device)) {
-		mlx5_ib_prefetch_sg_list(w->pd, w->pf_flags, w->sg_list,
-					 w->num_sge);
-		ib_device_put(w->pd->device);
-	}
-
-	num_pending_prefetch_dec(to_mdev(w->pd->device), w->sg_list,
-				 w->num_sge, 0);
-	kvfree(w);
+out:
+	srcu_read_unlock(&dev->odp_srcu, srcu_key);
+	return ret;
 }
 
 int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
@@ -1739,43 +1864,31 @@
 			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
 {
 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
-	u32 pf_flags = MLX5_PF_FLAGS_PREFETCH;
+	u32 pf_flags = 0;
 	struct prefetch_mr_work *work;
-	bool valid_req;
 	int srcu_key;
 
 	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
 		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;
 
+	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
+		pf_flags |= MLX5_PF_FLAGS_SNAPSHOT;
+
 	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
-		return mlx5_ib_prefetch_sg_list(pd, pf_flags, sg_list,
+		return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
 						num_sge);
 
-	work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL);
+	work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
 	if (!work)
 		return -ENOMEM;
 
-	memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge));
-
-	/* It is guaranteed that the pd when work is executed is the pd when
-	 * work was queued since pd can't be destroyed while it holds MRs and
-	 * destroying a MR leads to flushing the workquque
-	 */
-	work->pd = pd;
-	work->pf_flags = pf_flags;
-	work->num_sge = num_sge;
-
-	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
-
-	srcu_key = srcu_read_lock(&dev->mr_srcu);
-
-	valid_req = num_pending_prefetch_inc(pd, sg_list, num_sge);
-	if (valid_req)
-		queue_work(system_unbound_wq, &work->work);
-	else
-		kvfree(work);
-
-	srcu_read_unlock(&dev->mr_srcu, srcu_key);
-
-	return valid_req ? 0 : -EINVAL;
+	srcu_key = srcu_read_lock(&dev->odp_srcu);
+	if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
+		srcu_read_unlock(&dev->odp_srcu, srcu_key);
+		destroy_prefetch_work(work);
+		return -EINVAL;
+	}
+	queue_work(system_unbound_wq, &work->work);
+	srcu_read_unlock(&dev->odp_srcu, srcu_key);
+	return 0;
 }
diff --git a/drivers/infiniband/hw/mlx5/qos.c b/drivers/infiniband/hw/mlx5/qos.c
new file mode 100644
index 0000000..dce9255
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/qos.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2020, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/mlx5_user_ioctl_verbs.h>
+#include <linux/mlx5/driver.h>
+#include "mlx5_ib.h"
+
+#define UVERBS_MODULE_NAME mlx5_ib
+#include <rdma/uverbs_named_ioctl.h>
+
+static bool pp_is_supported(struct ib_device *device)
+{
+	struct mlx5_ib_dev *dev = to_mdev(device);
+
+	return (MLX5_CAP_GEN(dev->mdev, qos) &&
+		MLX5_CAP_QOS(dev->mdev, packet_pacing) &&
+		MLX5_CAP_QOS(dev->mdev, packet_pacing_uid));
+}
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_PP_OBJ_ALLOC)(
+	struct uverbs_attr_bundle *attrs)
+{
+	u8 rl_raw[MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)] = {};
+	struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
+		MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE);
+	struct mlx5_ib_dev *dev;
+	struct mlx5_ib_ucontext *c;
+	struct mlx5_ib_pp *pp_entry;
+	void *in_ctx;
+	u16 uid;
+	int inlen;
+	u32 flags;
+	int err;
+
+	c = to_mucontext(ib_uverbs_get_ucontext(attrs));
+	if (IS_ERR(c))
+		return PTR_ERR(c);
+
+	/* The allocated entry can be used only by a DEVX context */
+	if (!c->devx_uid)
+		return -EINVAL;
+
+	dev = to_mdev(c->ibucontext.device);
+	pp_entry = kzalloc(sizeof(*pp_entry), GFP_KERNEL);
+	if (!pp_entry)
+		return -ENOMEM;
+
+	in_ctx = uverbs_attr_get_alloced_ptr(attrs,
+					     MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX);
+	inlen = uverbs_attr_get_len(attrs,
+				    MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX);
+	memcpy(rl_raw, in_ctx, inlen);
+	err = uverbs_get_flags32(&flags, attrs,
+		MLX5_IB_ATTR_PP_OBJ_ALLOC_FLAGS,
+		MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX);
+	if (err)
+		goto err;
+
+	uid = (flags & MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX) ?
+		c->devx_uid : MLX5_SHARED_RESOURCE_UID;
+
+	err = mlx5_rl_add_rate_raw(dev->mdev, rl_raw, uid,
+			(flags & MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX),
+			&pp_entry->index);
+	if (err)
+		goto err;
+
+	pp_entry->mdev = dev->mdev;
+	uobj->object = pp_entry;
+	uverbs_finalize_uobj_create(attrs, MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE);
+
+	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_PP_OBJ_ALLOC_INDEX,
+			     &pp_entry->index, sizeof(pp_entry->index));
+	return err;
+
+err:
+	kfree(pp_entry);
+	return err;
+}
+
+static int pp_obj_cleanup(struct ib_uobject *uobject,
+			  enum rdma_remove_reason why,
+			  struct uverbs_attr_bundle *attrs)
+{
+	struct mlx5_ib_pp *pp_entry = uobject->object;
+
+	mlx5_rl_remove_rate_raw(pp_entry->mdev, pp_entry->index);
+	kfree(pp_entry);
+	return 0;
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_PP_OBJ_ALLOC,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE,
+			MLX5_IB_OBJECT_PP,
+			UVERBS_ACCESS_NEW,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_IN(
+		MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX,
+		UVERBS_ATTR_SIZE(1,
+			MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)),
+		UA_MANDATORY,
+		UA_ALLOC_AND_COPY),
+	UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_PP_OBJ_ALLOC_FLAGS,
+			enum mlx5_ib_uapi_pp_alloc_flags,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_PP_OBJ_ALLOC_INDEX,
+			   UVERBS_ATTR_TYPE(u16),
+			   UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_METHOD_DESTROY(
+	MLX5_IB_METHOD_PP_OBJ_DESTROY,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_PP_OBJ_DESTROY_HANDLE,
+			MLX5_IB_OBJECT_PP,
+			UVERBS_ACCESS_DESTROY,
+			UA_MANDATORY));
+
+DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_PP,
+			    UVERBS_TYPE_ALLOC_IDR(pp_obj_cleanup),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_PP_OBJ_ALLOC),
+			    &UVERBS_METHOD(MLX5_IB_METHOD_PP_OBJ_DESTROY));
+
+
+const struct uapi_definition mlx5_ib_qos_defs[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
+		MLX5_IB_OBJECT_PP,
+		UAPI_DEF_IS_OBJ_SUPPORTED(pp_is_supported)),
+	{},
+};
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 4540835..7a2bec0 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -38,10 +38,10 @@
 #include <linux/mlx5/fs.h>
 #include "mlx5_ib.h"
 #include "ib_rep.h"
+#include "counters.h"
 #include "cmd.h"
-
-/* not supported currently */
-static int wq_signature;
+#include "qp.h"
+#include "wr.h"
 
 enum {
 	MLX5_IB_ACK_REQ_FREQ	= 8,
@@ -54,32 +54,6 @@
 	MLX5_IB_LINK_TYPE_ETH		= 1
 };
 
-enum {
-	MLX5_IB_SQ_STRIDE	= 6,
-	MLX5_IB_SQ_UMR_INLINE_THRESHOLD = 64,
-};
-
-static const u32 mlx5_ib_opcode[] = {
-	[IB_WR_SEND]				= MLX5_OPCODE_SEND,
-	[IB_WR_LSO]				= MLX5_OPCODE_LSO,
-	[IB_WR_SEND_WITH_IMM]			= MLX5_OPCODE_SEND_IMM,
-	[IB_WR_RDMA_WRITE]			= MLX5_OPCODE_RDMA_WRITE,
-	[IB_WR_RDMA_WRITE_WITH_IMM]		= MLX5_OPCODE_RDMA_WRITE_IMM,
-	[IB_WR_RDMA_READ]			= MLX5_OPCODE_RDMA_READ,
-	[IB_WR_ATOMIC_CMP_AND_SWP]		= MLX5_OPCODE_ATOMIC_CS,
-	[IB_WR_ATOMIC_FETCH_AND_ADD]		= MLX5_OPCODE_ATOMIC_FA,
-	[IB_WR_SEND_WITH_INV]			= MLX5_OPCODE_SEND_INVAL,
-	[IB_WR_LOCAL_INV]			= MLX5_OPCODE_UMR,
-	[IB_WR_REG_MR]				= MLX5_OPCODE_UMR,
-	[IB_WR_MASKED_ATOMIC_CMP_AND_SWP]	= MLX5_OPCODE_ATOMIC_MASKED_CS,
-	[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]	= MLX5_OPCODE_ATOMIC_MASKED_FA,
-	[MLX5_IB_WR_UMR]			= MLX5_OPCODE_UMR,
-};
-
-struct mlx5_wqe_eth_pad {
-	u8 rsvd0[16];
-};
-
 enum raw_qp_set_mask_map {
 	MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID		= 1UL << 0,
 	MLX5_RAW_QP_RATE_LIMIT			= 1UL << 1,
@@ -129,14 +103,10 @@
  *
  * Return: zero on success, or an error code.
  */
-static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem,
-					void *buffer,
-					u32 buflen,
-					int wqe_index,
-					int wq_offset,
-					int wq_wqe_cnt,
-					int wq_wqe_shift,
-					int bcnt,
+static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem, void *buffer,
+					size_t buflen, int wqe_index,
+					int wq_offset, int wq_wqe_cnt,
+					int wq_wqe_shift, int bcnt,
 					size_t *bytes_copied)
 {
 	size_t offset = wq_offset + ((wqe_index % wq_wqe_cnt) << wq_wqe_shift);
@@ -160,11 +130,43 @@
 	return 0;
 }
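
The common WQE-read helper locates a WQE by wrapping the index modulo the ring and scaling by the WQE stride. The same computation as a standalone sketch, with illustrative WQ geometry:

#include <stddef.h>
#include <stdio.h>

int main(void)
{
	size_t wq_offset = 4096;	/* WQ starts one page into the buffer */
	int wq_wqe_cnt = 256;		/* ring size in WQEs */
	int wq_wqe_shift = 6;		/* 64-byte WQE stride */
	int wqe_index = 300;		/* caller index, wraps modulo the ring */

	size_t offset = wq_offset +
			((size_t)(wqe_index % wq_wqe_cnt) << wq_wqe_shift);

	printf("WQE %d lives at byte offset %zu\n", wqe_index, offset);
	return 0;
}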
 
-int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
-			     int wqe_index,
-			     void *buffer,
-			     int buflen,
-			     size_t *bc)
+static int mlx5_ib_read_kernel_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index,
+				      void *buffer, size_t buflen, size_t *bc)
+{
+	struct mlx5_wqe_ctrl_seg *ctrl;
+	size_t bytes_copied = 0;
+	size_t wqe_length;
+	void *p;
+	int ds;
+
+	wqe_index = wqe_index & qp->sq.fbc.sz_m1;
+
+	/* read the control segment first */
+	p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index);
+	ctrl = p;
+	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
+	wqe_length = ds * MLX5_WQE_DS_UNITS;
+
+	/* read rest of WQE if it spreads over more than one stride */
+	while (bytes_copied < wqe_length) {
+		size_t copy_length =
+			min_t(size_t, buflen - bytes_copied, MLX5_SEND_WQE_BB);
+
+		if (!copy_length)
+			break;
+
+		memcpy(buffer + bytes_copied, p, copy_length);
+		bytes_copied += copy_length;
+
+		wqe_index = (wqe_index + 1) & qp->sq.fbc.sz_m1;
+		p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index);
+	}
+	*bc = bytes_copied;
+	return 0;
+}
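
The kernel-SQ reader above derives the WQE length from the DS count in the control segment and copies whole 64-byte strides, wrapping around the ring. A standalone sketch of that arithmetic; the DS-unit, basic-block and mask constants are the usual mlx5 values, while the ring size and sample qpn_ds word are illustrative.

#include <stdint.h>
#include <stdio.h>

#define WQE_DS_UNITS	16	/* MLX5_WQE_DS_UNITS: 16-byte data segments */
#define SEND_WQE_BB	64	/* MLX5_SEND_WQE_BB: 64-byte basic block */
#define CTRL_DS_MASK	0x3f	/* MLX5_WQE_CTRL_DS_MASK */

int main(void)
{
	uint32_t sz_m1 = 63;		/* ring of 64 strides, like fbc.sz_m1 */
	uint32_t qpn_ds = 0x00000105;	/* low 6 bits: 5 data segments */
	uint32_t wqe_index = 62 & sz_m1;
	size_t buflen = 4096;

	size_t wqe_length = (qpn_ds & CTRL_DS_MASK) * WQE_DS_UNITS; /* 80 B */
	size_t copied = 0;

	/* copy whole strides until the WQE is covered, wrapping the ring */
	while (copied < wqe_length) {
		size_t chunk = buflen - copied;

		if (chunk > SEND_WQE_BB)
			chunk = SEND_WQE_BB;
		if (!chunk)
			break;
		printf("copy %zu bytes from stride %u\n", chunk, wqe_index);
		copied += chunk;
		wqe_index = (wqe_index + 1) & sz_m1;
	}
	printf("wqe_length=%zu copied=%zu\n", wqe_length, copied);
	return 0;
}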
+
+static int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index,
+				    void *buffer, size_t buflen, size_t *bc)
 {
 	struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
 	struct ib_umem *umem = base->ubuffer.umem;
@@ -176,18 +178,10 @@
 	int ret;
 	int ds;
 
-	if (buflen < sizeof(*ctrl))
-		return -EINVAL;
-
 	/* at first read as much as possible */
-	ret = mlx5_ib_read_user_wqe_common(umem,
-					   buffer,
-					   buflen,
-					   wqe_index,
-					   wq->offset,
-					   wq->wqe_cnt,
-					   wq->wqe_shift,
-					   buflen,
+	ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index,
+					   wq->offset, wq->wqe_cnt,
+					   wq->wqe_shift, buflen,
 					   &bytes_copied);
 	if (ret)
 		return ret;
@@ -210,13 +204,9 @@
 	 * so read the remaining bytes starting
 	 * from  wqe_index 0
 	 */
-	ret = mlx5_ib_read_user_wqe_common(umem,
-					   buffer + bytes_copied,
-					   buflen - bytes_copied,
-					   0,
-					   wq->offset,
-					   wq->wqe_cnt,
-					   wq->wqe_shift,
+	ret = mlx5_ib_read_user_wqe_common(umem, buffer + bytes_copied,
+					   buflen - bytes_copied, 0, wq->offset,
+					   wq->wqe_cnt, wq->wqe_shift,
 					   wqe_length - bytes_copied,
 					   &bytes_copied2);
 
@@ -226,11 +216,24 @@
 	return 0;
 }
 
-int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
-			     int wqe_index,
-			     void *buffer,
-			     int buflen,
-			     size_t *bc)
+int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+			size_t buflen, size_t *bc)
+{
+	struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
+	struct ib_umem *umem = base->ubuffer.umem;
+
+	if (buflen < sizeof(struct mlx5_wqe_ctrl_seg))
+		return -EINVAL;
+
+	if (!umem)
+		return mlx5_ib_read_kernel_wqe_sq(qp, wqe_index, buffer,
+						  buflen, bc);
+
+	return mlx5_ib_read_user_wqe_sq(qp, wqe_index, buffer, buflen, bc);
+}
+
+static int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index,
+				    void *buffer, size_t buflen, size_t *bc)
 {
 	struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
 	struct ib_umem *umem = base->ubuffer.umem;
@@ -238,14 +241,9 @@
 	size_t bytes_copied;
 	int ret;
 
-	ret = mlx5_ib_read_user_wqe_common(umem,
-					   buffer,
-					   buflen,
-					   wqe_index,
-					   wq->offset,
-					   wq->wqe_cnt,
-					   wq->wqe_shift,
-					   buflen,
+	ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index,
+					   wq->offset, wq->wqe_cnt,
+					   wq->wqe_shift, buflen,
 					   &bytes_copied);
 
 	if (ret)
@@ -254,25 +252,33 @@
 	return 0;
 }
 
-int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq,
-			      int wqe_index,
-			      void *buffer,
-			      int buflen,
-			      size_t *bc)
+int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
+			size_t buflen, size_t *bc)
+{
+	struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
+	struct ib_umem *umem = base->ubuffer.umem;
+	struct mlx5_ib_wq *wq = &qp->rq;
+	size_t wqe_size = 1 << wq->wqe_shift;
+
+	if (buflen < wqe_size)
+		return -EINVAL;
+
+	if (!umem)
+		return -EOPNOTSUPP;
+
+	return mlx5_ib_read_user_wqe_rq(qp, wqe_index, buffer, buflen, bc);
+}
+
+static int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
+				     void *buffer, size_t buflen, size_t *bc)
 {
 	struct ib_umem *umem = srq->umem;
 	size_t bytes_copied;
 	int ret;
 
-	ret = mlx5_ib_read_user_wqe_common(umem,
-					   buffer,
-					   buflen,
-					   wqe_index,
-					   0,
-					   srq->msrq.max,
-					   srq->msrq.wqe_shift,
-					   buflen,
-					   &bytes_copied);
+	ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index, 0,
+					   srq->msrq.max, srq->msrq.wqe_shift,
+					   buflen, &bytes_copied);
 
 	if (ret)
 		return ret;
@@ -280,6 +286,21 @@
 	return 0;
 }
 
+int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
+			 size_t buflen, size_t *bc)
+{
+	struct ib_umem *umem = srq->umem;
+	size_t wqe_size = 1 << srq->msrq.wqe_shift;
+
+	if (buflen < wqe_size)
+		return -EINVAL;
+
+	if (!umem)
+		return -EOPNOTSUPP;
+
+	return mlx5_ib_read_user_wqe_srq(srq, wqe_index, buffer, buflen, bc);
+}
+
 static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
 {
 	struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
@@ -344,17 +365,26 @@
 		cap->max_recv_wr = 0;
 		cap->max_recv_sge = 0;
 	} else {
+		int wq_sig = !!(qp->flags_en & MLX5_QP_FLAG_SIGNATURE);
+
 		if (ucmd) {
 			qp->rq.wqe_cnt = ucmd->rq_wqe_count;
 			if (ucmd->rq_wqe_shift > BITS_PER_BYTE * sizeof(ucmd->rq_wqe_shift))
 				return -EINVAL;
 			qp->rq.wqe_shift = ucmd->rq_wqe_shift;
-			if ((1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) < qp->wq_sig)
+			if ((1 << qp->rq.wqe_shift) /
+				    sizeof(struct mlx5_wqe_data_seg) <
+			    wq_sig)
 				return -EINVAL;
-			qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig;
+			qp->rq.max_gs =
+				(1 << qp->rq.wqe_shift) /
+					sizeof(struct mlx5_wqe_data_seg) -
+				wq_sig;
 			qp->rq.max_post = qp->rq.wqe_cnt;
 		} else {
-			wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0;
+			wqe_size =
+				wq_sig ? sizeof(struct mlx5_wqe_signature_seg) :
+					 0;
 			wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg);
 			wqe_size = roundup_pow_of_two(wqe_size);
 			wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size;
@@ -368,7 +398,10 @@
 				return -EINVAL;
 			}
 			qp->rq.wqe_shift = ilog2(wqe_size);
-			qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig;
+			qp->rq.max_gs =
+				(1 << qp->rq.wqe_shift) /
+					sizeof(struct mlx5_wqe_data_seg) -
+				wq_sig;
 			qp->rq.max_post = qp->rq.wqe_cnt;
 		}
 	}
@@ -383,7 +416,7 @@
 	switch (attr->qp_type) {
 	case IB_QPT_XRC_INI:
 		size += sizeof(struct mlx5_wqe_xrc_seg);
-		/* fall through */
+		fallthrough;
 	case IB_QPT_RC:
 		size += sizeof(struct mlx5_wqe_ctrl_seg) +
 			max(sizeof(struct mlx5_wqe_atomic_seg) +
@@ -408,7 +441,7 @@
 		if (attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
 			size += sizeof(struct mlx5_wqe_eth_pad) +
 				sizeof(struct mlx5_wqe_eth_seg);
-		/* fall through */
+		fallthrough;
 	case IB_QPT_SMI:
 	case MLX5_IB_QPT_HW_GSI:
 		size += sizeof(struct mlx5_wqe_ctrl_seg) +
@@ -548,7 +581,7 @@
 	}
 
 	if (attr->qp_type == IB_QPT_RAW_PACKET ||
-	    qp->flags & MLX5_IB_QP_UNDERLAY) {
+	    qp->flags & IB_QP_CREATE_SOURCE_QPN) {
 		base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
 		qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6;
 	} else {
@@ -650,6 +683,9 @@
 {
 	int bfregn = -ENOMEM;
 
+	if (bfregi->lib_uar_dyn)
+		return -EINVAL;
+
 	mutex_lock(&bfregi->lock);
 	if (bfregi->ver >= 2) {
 		bfregn = alloc_high_class_bfreg(dev, bfregi);
@@ -700,10 +736,7 @@
 	case IB_QPT_SMI:		return MLX5_QP_ST_QP0;
 	case MLX5_IB_QPT_HW_GSI:	return MLX5_QP_ST_QP1;
 	case MLX5_IB_QPT_DCI:		return MLX5_QP_ST_DCI;
-	case IB_QPT_RAW_IPV6:		return MLX5_QP_ST_RAW_IPV6;
-	case IB_QPT_RAW_PACKET:
-	case IB_QPT_RAW_ETHERTYPE:	return MLX5_QP_ST_RAW_ETHERTYPE;
-	case IB_QPT_MAX:
+	case IB_QPT_RAW_PACKET:		return MLX5_QP_ST_RAW_ETHERTYPE;
 	default:		return -EINVAL;
 	}
 }
@@ -721,6 +754,9 @@
 	u32 index_of_sys_page;
 	u32 offset;
 
+	if (bfregi->lib_uar_dyn)
+		return -EINVAL;
+
 	bfregs_per_sys_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k) *
 				MLX5_NON_FP_BFREGS_PER_UAR;
 	index_of_sys_page = bfregn / bfregs_per_sys_page;
@@ -749,7 +785,7 @@
 {
 	int err;
 
-	*umem = ib_umem_get(udata, addr, size, 0, 0);
+	*umem = ib_umem_get(&dev->ib_dev, addr, size, 0);
 	if (IS_ERR(*umem)) {
 		mlx5_ib_dbg(dev, "umem_get failed\n");
 		return PTR_ERR(*umem);
@@ -806,7 +842,7 @@
 	if (!ucmd->buf_addr)
 		return -EINVAL;
 
-	rwq->umem = ib_umem_get(udata, ucmd->buf_addr, rwq->buf_size, 0, 0);
+	rwq->umem = ib_umem_get(&dev->ib_dev, ucmd->buf_addr, rwq->buf_size, 0);
 	if (IS_ERR(rwq->umem)) {
 		mlx5_ib_dbg(dev, "umem_get failed\n");
 		err = PTR_ERR(rwq->umem);
@@ -837,7 +873,6 @@
 		goto err_umem;
 	}
 
-	rwq->create_type = MLX5_WQ_USER;
 	return 0;
 
 err_umem:
@@ -852,15 +887,14 @@
 				bfregn % MLX5_NON_FP_BFREGS_PER_UAR;
 }
 
-static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
-			  struct mlx5_ib_qp *qp, struct ib_udata *udata,
-			  struct ib_qp_init_attr *attr,
-			  u32 **in,
-			  struct mlx5_ib_create_qp_resp *resp, int *inlen,
-			  struct mlx5_ib_qp_base *base)
+static int _create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+			   struct mlx5_ib_qp *qp, struct ib_udata *udata,
+			   struct ib_qp_init_attr *attr, u32 **in,
+			   struct mlx5_ib_create_qp_resp *resp, int *inlen,
+			   struct mlx5_ib_qp_base *base,
+			   struct mlx5_ib_create_qp *ucmd)
 {
 	struct mlx5_ib_ucontext *context;
-	struct mlx5_ib_create_qp ucmd;
 	struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer;
 	int page_shift = 0;
 	int uar_index = 0;
@@ -872,33 +906,33 @@
 	void *qpc;
 	int err;
 	u16 uid;
-
-	err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
-	if (err) {
-		mlx5_ib_dbg(dev, "copy failed\n");
-		return err;
-	}
+	u32 uar_flags;
 
 	context = rdma_udata_to_drv_context(udata, struct mlx5_ib_ucontext,
 					    ibucontext);
-	if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) {
+	uar_flags = qp->flags_en &
+		    (MLX5_QP_FLAG_UAR_PAGE_INDEX | MLX5_QP_FLAG_BFREG_INDEX);
+	switch (uar_flags) {
+	case MLX5_QP_FLAG_UAR_PAGE_INDEX:
+		uar_index = ucmd->bfreg_index;
+		bfregn = MLX5_IB_INVALID_BFREG;
+		break;
+	case MLX5_QP_FLAG_BFREG_INDEX:
 		uar_index = bfregn_to_uar_index(dev, &context->bfregi,
-						ucmd.bfreg_index, true);
+						ucmd->bfreg_index, true);
 		if (uar_index < 0)
 			return uar_index;
-
 		bfregn = MLX5_IB_INVALID_BFREG;
-	} else if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) {
-		/*
-		 * TBD: should come from the verbs when we have the API
-		 */
-		/* In CROSS_CHANNEL CQ and QP must use the same UAR */
-		bfregn = MLX5_CROSS_CHANNEL_BFREG;
-	}
-	else {
+		break;
+	case 0:
+		if (qp->flags & IB_QP_CREATE_CROSS_CHANNEL)
+			return -EINVAL;
 		bfregn = alloc_bfreg(dev, &context->bfregi);
 		if (bfregn < 0)
 			return bfregn;
+		break;
+	default:
+		return -EINVAL;
 	}
 
 	mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index);
@@ -910,12 +944,12 @@
 	qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
 	qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
 
-	err = set_user_buf_size(dev, qp, &ucmd, base, attr);
+	err = set_user_buf_size(dev, qp, ucmd, base, attr);
 	if (err)
 		goto err_bfreg;
 
-	if (ucmd.buf_addr && ubuffer->buf_size) {
-		ubuffer->buf_addr = ucmd.buf_addr;
+	if (ucmd->buf_addr && ubuffer->buf_size) {
+		ubuffer->buf_addr = ucmd->buf_addr;
 		err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr,
 				       ubuffer->buf_size, &ubuffer->umem,
 				       &npages, &page_shift, &ncont, &offset);
@@ -933,8 +967,7 @@
 		goto err_umem;
 	}
 
-	uid = (attr->qp_type != IB_QPT_XRC_TGT &&
-	       attr->qp_type != IB_QPT_XRC_INI) ? to_mpd(pd)->uid : 0;
+	uid = (attr->qp_type != IB_QPT_XRC_INI) ? to_mpd(pd)->uid : 0;
 	MLX5_SET(create_qp_in, *in, uid, uid);
 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas);
 	if (ubuffer->umem)
@@ -952,24 +985,14 @@
 		resp->bfreg_index = MLX5_IB_INVALID_BFREG;
 	qp->bfregn = bfregn;
 
-	err = mlx5_ib_db_map_user(context, udata, ucmd.db_addr, &qp->db);
+	err = mlx5_ib_db_map_user(context, udata, ucmd->db_addr, &qp->db);
 	if (err) {
 		mlx5_ib_dbg(dev, "map failed\n");
 		goto err_free;
 	}
 
-	err = ib_copy_to_udata(udata, resp, min(udata->outlen, sizeof(*resp)));
-	if (err) {
-		mlx5_ib_dbg(dev, "copy failed\n");
-		goto err_unmap;
-	}
-	qp->create_type = MLX5_QP_USER;
-
 	return 0;
 
-err_unmap:
-	mlx5_ib_db_unmap_user(context, &qp->db);
-
 err_free:
 	kvfree(*in);
 
@@ -982,70 +1005,51 @@
 	return err;
 }
 
-static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd,
-			    struct mlx5_ib_qp *qp, struct mlx5_ib_qp_base *base,
-			    struct ib_udata *udata)
+static void destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+		       struct mlx5_ib_qp_base *base, struct ib_udata *udata)
 {
-	struct mlx5_ib_ucontext *context =
-		rdma_udata_to_drv_context(
-			udata,
-			struct mlx5_ib_ucontext,
-			ibucontext);
+	struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
 
-	mlx5_ib_db_unmap_user(context, &qp->db);
-	ib_umem_release(base->ubuffer.umem);
+	if (udata) {
+		/* User QP */
+		mlx5_ib_db_unmap_user(context, &qp->db);
+		ib_umem_release(base->ubuffer.umem);
 
-	/*
-	 * Free only the BFREGs which are handled by the kernel.
-	 * BFREGs of UARs allocated dynamically are handled by user.
-	 */
-	if (qp->bfregn != MLX5_IB_INVALID_BFREG)
-		mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn);
+		/*
+		 * Free only the BFREGs which are handled by the kernel.
+		 * BFREGs of UARs allocated dynamically are handled by user.
+		 */
+		if (qp->bfregn != MLX5_IB_INVALID_BFREG)
+			mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn);
+		return;
+	}
+
+	/* Kernel QP */
+	kvfree(qp->sq.wqe_head);
+	kvfree(qp->sq.w_list);
+	kvfree(qp->sq.wrid);
+	kvfree(qp->sq.wr_data);
+	kvfree(qp->rq.wrid);
+	if (qp->db.db)
+		mlx5_db_free(dev->mdev, &qp->db);
+	if (qp->buf.frags)
+		mlx5_frag_buf_free(dev->mdev, &qp->buf);
 }
 
-/* get_sq_edge - Get the next nearby edge.
- *
- * An 'edge' is defined as the first following address after the end
- * of the fragment or the SQ. Accordingly, during the WQE construction
- * which repetitively increases the pointer to write the next data, it
- * simply should check if it gets to an edge.
- *
- * @sq - SQ buffer.
- * @idx - Stride index in the SQ buffer.
- *
- * Return:
- *	The new edge.
- */
-static void *get_sq_edge(struct mlx5_ib_wq *sq, u32 idx)
-{
-	void *fragment_end;
-
-	fragment_end = mlx5_frag_buf_get_wqe
-		(&sq->fbc,
-		 mlx5_frag_buf_get_idx_last_contig_stride(&sq->fbc, idx));
-
-	return fragment_end + MLX5_SEND_WQE_BB;
-}
-
-static int create_kernel_qp(struct mlx5_ib_dev *dev,
-			    struct ib_qp_init_attr *init_attr,
-			    struct mlx5_ib_qp *qp,
-			    u32 **in, int *inlen,
-			    struct mlx5_ib_qp_base *base)
+static int _create_kernel_qp(struct mlx5_ib_dev *dev,
+			     struct ib_qp_init_attr *init_attr,
+			     struct mlx5_ib_qp *qp, u32 **in, int *inlen,
+			     struct mlx5_ib_qp_base *base)
 {
 	int uar_index;
 	void *qpc;
 	int err;
 
-	if (init_attr->create_flags & ~(IB_QP_CREATE_INTEGRITY_EN |
-					IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK |
-					IB_QP_CREATE_IPOIB_UD_LSO |
-					IB_QP_CREATE_NETIF_QP |
-					mlx5_ib_create_qp_sqpn_qp1()))
-		return -EINVAL;
-
 	if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR)
 		qp->bf.bfreg = &dev->fp_bfreg;
+	else if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST)
+		qp->bf.bfreg = &dev->wc_bfreg;
 	else
 		qp->bf.bfreg = &dev->bfreg;
 
@@ -1104,10 +1108,8 @@
 	MLX5_SET(qpc, qpc, fre, 1);
 	MLX5_SET(qpc, qpc, rlky, 1);
 
-	if (init_attr->create_flags & mlx5_ib_create_qp_sqpn_qp1()) {
+	if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1)
 		MLX5_SET(qpc, qpc, deth_sqpn, 1);
-		qp->flags |= MLX5_IB_QP_SQPN_QP1;
-	}
 
 	mlx5_fill_page_frag_array(&qp->buf,
 				  (__be64 *)MLX5_ADDR_OF(create_qp_in,
@@ -1135,7 +1137,6 @@
 		err = -ENOMEM;
 		goto err_wrid;
 	}
-	qp->create_type = MLX5_QP_KERNEL;
 
 	return 0;
 
@@ -1155,36 +1156,15 @@
 	return err;
 }
 
-static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
-{
-	kvfree(qp->sq.wqe_head);
-	kvfree(qp->sq.w_list);
-	kvfree(qp->sq.wrid);
-	kvfree(qp->sq.wr_data);
-	kvfree(qp->rq.wrid);
-	mlx5_db_free(dev->mdev, &qp->db);
-	mlx5_frag_buf_free(dev->mdev, &qp->buf);
-}
-
 static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr)
 {
-	if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) ||
-	    (attr->qp_type == MLX5_IB_QPT_DCI) ||
-	    (attr->qp_type == IB_QPT_XRC_INI))
+	if (attr->srq || (qp->type == IB_QPT_XRC_TGT) ||
+	    (qp->type == MLX5_IB_QPT_DCI) || (qp->type == IB_QPT_XRC_INI))
 		return MLX5_SRQ_RQ;
 	else if (!qp->has_rq)
 		return MLX5_ZERO_LEN_RQ;
-	else
-		return MLX5_NON_ZERO_RQ;
-}
 
-static int is_connected(enum ib_qp_type qp_type)
-{
-	if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC ||
-	    qp_type == MLX5_IB_QPT_DCI)
-		return 1;
-
-	return 0;
+	return MLX5_NON_ZERO_RQ;
 }
 
 static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev,
@@ -1192,15 +1172,15 @@
 				    struct mlx5_ib_sq *sq, u32 tdn,
 				    struct ib_pd *pd)
 {
-	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0};
+	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
 	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
 
 	MLX5_SET(create_tis_in, in, uid, to_mpd(pd)->uid);
 	MLX5_SET(tisc, tisc, transport_domain, tdn);
-	if (qp->flags & MLX5_IB_QP_UNDERLAY)
+	if (qp->flags & IB_QP_CREATE_SOURCE_QPN)
 		MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn);
 
-	return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn);
+	return mlx5_core_create_tis(dev->mdev, in, &sq->tisn);
 }
 
 static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev,
@@ -1274,7 +1254,7 @@
 	pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
 	mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0);
 
-	err = mlx5_core_create_sq_tracked(dev->mdev, in, inlen, &sq->base.mqp);
+	err = mlx5_core_create_sq_tracked(dev, in, inlen, &sq->base.mqp);
 
 	kvfree(in);
 
@@ -1294,7 +1274,7 @@
 				     struct mlx5_ib_sq *sq)
 {
 	destroy_flow_rule_vport_sq(sq);
-	mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp);
+	mlx5_core_destroy_sq_tracked(dev, &sq->base.mqp);
 	ib_umem_release(sq->ubuffer.umem);
 }
 
@@ -1346,7 +1326,7 @@
 	MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index));
 	MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv));
 
-	if (mqp->flags & MLX5_IB_QP_CAP_SCATTER_FCS)
+	if (mqp->flags & IB_QP_CREATE_SCATTER_FCS)
 		MLX5_SET(rqc, rqc, scatter_fcs, 1);
 
 	wq = MLX5_ADDR_OF(rqc, rqc, wq);
@@ -1364,7 +1344,7 @@
 	qp_pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, qpin, pas);
 	memcpy(pas, qp_pas, rq_pas_size);
 
-	err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rq->base.mqp);
+	err = mlx5_core_create_rq_tracked(dev, in, inlen, &rq->base.mqp);
 
 	kvfree(in);
 
@@ -1374,14 +1354,7 @@
 static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev,
 				     struct mlx5_ib_rq *rq)
 {
-	mlx5_core_destroy_rq_tracked(dev->mdev, &rq->base.mqp);
-}
-
-static bool tunnel_offload_supported(struct mlx5_core_dev *dev)
-{
-	return  (MLX5_CAP_ETH(dev, tunnel_stateless_vxlan) ||
-		 MLX5_CAP_ETH(dev, tunnel_stateless_gre) ||
-		 MLX5_CAP_ETH(dev, tunnel_stateless_geneve_rx));
+	mlx5_core_destroy_rq_tracked(dev, &rq->base.mqp);
 }
 
 static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
@@ -1397,9 +1370,8 @@
 
 static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
 				    struct mlx5_ib_rq *rq, u32 tdn,
-				    u32 *qp_flags_en,
-				    struct ib_pd *pd,
-				    u32 *out, int outlen)
+				    u32 *qp_flags_en, struct ib_pd *pd,
+				    u32 *out)
 {
 	u8 lb_flag = 0;
 	u32 *in;
@@ -1432,9 +1404,8 @@
 	}
 
 	MLX5_SET(tirc, tirc, self_lb_block, lb_flag);
-
-	err = mlx5_core_create_tir_out(dev->mdev, in, inlen, out, outlen);
-
+	MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR);
+	err = mlx5_cmd_exec_inout(dev->mdev, create_tir, in, out);
 	rq->tirn = MLX5_GET(create_tir_out, out, tirn);
 	if (!err && MLX5_GET(tirc, tirc, self_lb_block)) {
 		err = mlx5_ib_enable_lb(dev, false, true);
@@ -1488,17 +1459,16 @@
 	if (qp->rq.wqe_cnt) {
 		rq->base.container_mibqp = qp;
 
-		if (qp->flags & MLX5_IB_QP_CVLAN_STRIPPING)
+		if (qp->flags & IB_QP_CREATE_CVLAN_STRIPPING)
 			rq->flags |= MLX5_IB_RQ_CVLAN_STRIPPING;
-		if (qp->flags & MLX5_IB_QP_PCI_WRITE_END_PADDING)
+		if (qp->flags & IB_QP_CREATE_PCI_WRITE_END_PADDING)
 			rq->flags |= MLX5_IB_RQ_PCI_WRITE_END_PADDING;
 		err = create_raw_packet_qp_rq(dev, rq, in, inlen, pd);
 		if (err)
 			goto err_destroy_sq;
 
-		err = create_raw_packet_qp_tir(
-			dev, rq, tdn, &qp->flags_en, pd, out,
-			MLX5_ST_SZ_BYTES(create_tir_out));
+		err = create_raw_packet_qp_tir(dev, rq, tdn, &qp->flags_en, pd,
+					       out);
 		if (err)
 			goto err_destroy_rq;
 
@@ -1507,7 +1477,8 @@
 			resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_RQN;
 			resp->tirn = rq->tirn;
 			resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN;
-			if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner)) {
+			if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner) ||
+			    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner_v2)) {
 				resp->tir_icm_addr = MLX5_GET(
 					create_tir_out, out, icm_address_31_0);
 				resp->tir_icm_addr |=
@@ -1526,14 +1497,8 @@
 
 	qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? sq->base.mqp.qpn :
 						     rq->base.mqp.qpn;
-	err = ib_copy_to_udata(udata, resp, min(udata->outlen, sizeof(*resp)));
-	if (err)
-		goto err_destroy_tir;
-
 	return 0;
 
-err_destroy_tir:
-	destroy_raw_packet_qp_tir(dev, rq, qp->flags_en, pd);
 err_destroy_rq:
 	destroy_raw_packet_qp_rq(dev, rq);
 err_destroy_sq:
@@ -1585,14 +1550,27 @@
 			     to_mpd(qp->ibqp.pd)->uid);
 }
 
-static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
-				 struct ib_pd *pd,
-				 struct ib_qp_init_attr *init_attr,
-				 struct ib_udata *udata)
+struct mlx5_create_qp_params {
+	struct ib_udata *udata;
+	size_t inlen;
+	size_t outlen;
+	size_t ucmd_size;
+	void *ucmd;
+	u8 is_rss_raw : 1;
+	struct ib_qp_init_attr *attr;
+	u32 uidx;
+	struct mlx5_ib_create_qp_resp resp;
+};
+
+static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+				 struct mlx5_ib_qp *qp,
+				 struct mlx5_create_qp_params *params)
 {
+	struct ib_qp_init_attr *init_attr = params->attr;
+	struct mlx5_ib_create_qp_rss *ucmd = params->ucmd;
+	struct ib_udata *udata = params->udata;
 	struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context(
 		udata, struct mlx5_ib_ucontext, ibucontext);
-	struct mlx5_ib_create_qp_resp resp = {};
 	int inlen;
 	int outlen;
 	int err;
@@ -1602,79 +1580,28 @@
 	void *hfso;
 	u32 selected_fields = 0;
 	u32 outer_l4;
-	size_t min_resp_len;
 	u32 tdn = mucontext->tdn;
-	struct mlx5_ib_create_qp_rss ucmd = {};
-	size_t required_cmd_sz;
 	u8 lb_flag = 0;
 
-	if (init_attr->qp_type != IB_QPT_RAW_PACKET)
-		return -EOPNOTSUPP;
-
-	if (init_attr->create_flags || init_attr->send_cq)
-		return -EINVAL;
-
-	min_resp_len = offsetof(typeof(resp), bfreg_index) + sizeof(resp.bfreg_index);
-	if (udata->outlen < min_resp_len)
-		return -EINVAL;
-
-	required_cmd_sz = offsetof(typeof(ucmd), flags) + sizeof(ucmd.flags);
-	if (udata->inlen < required_cmd_sz) {
-		mlx5_ib_dbg(dev, "invalid inlen\n");
-		return -EINVAL;
-	}
-
-	if (udata->inlen > sizeof(ucmd) &&
-	    !ib_is_udata_cleared(udata, sizeof(ucmd),
-				 udata->inlen - sizeof(ucmd))) {
-		mlx5_ib_dbg(dev, "inlen is not supported\n");
-		return -EOPNOTSUPP;
-	}
-
-	if (ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen))) {
-		mlx5_ib_dbg(dev, "copy failed\n");
-		return -EFAULT;
-	}
-
-	if (ucmd.comp_mask) {
+	if (ucmd->comp_mask) {
 		mlx5_ib_dbg(dev, "invalid comp mask\n");
 		return -EOPNOTSUPP;
 	}
 
-	if (ucmd.flags & ~(MLX5_QP_FLAG_TUNNEL_OFFLOADS |
-			   MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
-			   MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)) {
-		mlx5_ib_dbg(dev, "invalid flags\n");
-		return -EOPNOTSUPP;
-	}
-
-	if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS &&
-	    !tunnel_offload_supported(dev->mdev)) {
-		mlx5_ib_dbg(dev, "tunnel offloads isn't supported\n");
-		return -EOPNOTSUPP;
-	}
-
-	if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_INNER &&
-	    !(ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS)) {
+	if (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_INNER &&
+	    !(ucmd->flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS)) {
 		mlx5_ib_dbg(dev, "Tunnel offloads must be set for inner RSS\n");
 		return -EOPNOTSUPP;
 	}
 
-	if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC || dev->is_rep) {
-		lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST;
+	if (dev->is_rep)
 		qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC;
-	}
 
-	if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) {
+	if (qp->flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC)
+		lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST;
+
+	if (qp->flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)
 		lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST;
-		qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC;
-	}
-
-	err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
-	if (err) {
-		mlx5_ib_dbg(dev, "copy failed\n");
-		return -EINVAL;
-	}
 
 	inlen = MLX5_ST_SZ_BYTES(create_tir_in);
 	outlen = MLX5_ST_SZ_BYTES(create_tir_out);
@@ -1693,29 +1620,29 @@
 
 	hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
 
-	if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS)
+	if (ucmd->flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS)
 		MLX5_SET(tirc, tirc, tunneled_offload_en, 1);
 
 	MLX5_SET(tirc, tirc, self_lb_block, lb_flag);
 
-	if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_INNER)
+	if (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_INNER)
 		hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner);
 	else
 		hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
 
-	switch (ucmd.rx_hash_function) {
+	switch (ucmd->rx_hash_function) {
 	case MLX5_RX_HASH_FUNC_TOEPLITZ:
 	{
 		void *rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
 		size_t len = MLX5_FLD_SZ_BYTES(tirc, rx_hash_toeplitz_key);
 
-		if (len != ucmd.rx_key_len) {
+		if (len != ucmd->rx_key_len) {
 			err = -EINVAL;
 			goto err;
 		}
 
 		MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
-		memcpy(rss_key, ucmd.rx_hash_key, len);
+		memcpy(rss_key, ucmd->rx_hash_key, len);
 		break;
 	}
 	default:
@@ -1723,7 +1650,7 @@
 		goto err;
 	}
 
-	if (!ucmd.rx_hash_fields_mask) {
+	if (!ucmd->rx_hash_fields_mask) {
 		/* special case when this TIR serves as steering entry without hashing */
 		if (!init_attr->rwq_ind_tbl->log_ind_tbl_size)
 			goto create_tir;
@@ -1731,29 +1658,31 @@
 		goto err;
 	}
 
-	if (((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
-	     (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) &&
-	     ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) ||
-	     (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))) {
+	if (((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
+	     (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4)) &&
+	     ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) ||
+	     (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))) {
 		err = -EINVAL;
 		goto err;
 	}
 
 	/* If none of IPV4 & IPV6 SRC/DST was set - this bit field is ignored */
-	if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
-	    (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4))
+	if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
+	    (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4))
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 			 MLX5_L3_PROT_TYPE_IPV4);
-	else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) ||
-		 (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))
+	else if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6) ||
+		 (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))
 		MLX5_SET(rx_hash_field_select, hfso, l3_prot_type,
 			 MLX5_L3_PROT_TYPE_IPV6);
 
-	outer_l4 = ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
-		    (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP)) << 0 |
-		   ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) ||
-		    (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP)) << 1 |
-		   (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI) << 2;
+	outer_l4 = ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
+		    (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP))
+			   << 0 |
+		   ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) ||
+		    (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))
+			   << 1 |
+		   (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI) << 2;
 
 	/* Check that only one l4 protocol is set */
 	if (outer_l4 & (outer_l4 - 1)) {
@@ -1762,38 +1691,39 @@
 	}
 
 	/* If none of TCP & UDP SRC/DST was set - this bit field is ignored */
-	if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
-	    (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP))
+	if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
+	    (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP))
 		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
 			 MLX5_L4_PROT_TYPE_TCP);
-	else if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) ||
-		 (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))
+	else if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP) ||
+		 (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))
 		MLX5_SET(rx_hash_field_select, hfso, l4_prot_type,
 			 MLX5_L4_PROT_TYPE_UDP);
 
-	if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
-	    (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6))
+	if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV4) ||
+	    (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_IPV6))
 		selected_fields |= MLX5_HASH_FIELD_SEL_SRC_IP;
 
-	if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4) ||
-	    (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))
+	if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV4) ||
+	    (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_IPV6))
 		selected_fields |= MLX5_HASH_FIELD_SEL_DST_IP;
 
-	if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
-	    (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP))
+	if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_TCP) ||
+	    (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_SRC_PORT_UDP))
 		selected_fields |= MLX5_HASH_FIELD_SEL_L4_SPORT;
 
-	if ((ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP) ||
-	    (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))
+	if ((ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_TCP) ||
+	    (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_DST_PORT_UDP))
 		selected_fields |= MLX5_HASH_FIELD_SEL_L4_DPORT;
 
-	if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI)
+	if (ucmd->rx_hash_fields_mask & MLX5_RX_HASH_IPSEC_SPI)
 		selected_fields |= MLX5_HASH_FIELD_SEL_IPSEC_SPI;
 
 	MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields);
 
 create_tir:
-	err = mlx5_core_create_tir_out(dev->mdev, in, inlen, out, outlen);
+	MLX5_SET(create_tir_in, in, opcode, MLX5_CMD_OP_CREATE_TIR);
+	err = mlx5_cmd_exec_inout(dev->mdev, create_tir, in, out);
 
 	qp->rss_qp.tirn = MLX5_GET(create_tir_out, out, tirn);
 	if (!err && MLX5_GET(tirc, tirc, self_lb_block)) {
@@ -1808,75 +1738,45 @@
 		goto err;
 
 	if (mucontext->devx_uid) {
-		resp.comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN;
-		resp.tirn = qp->rss_qp.tirn;
-		if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner)) {
-			resp.tir_icm_addr =
+		params->resp.comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN;
+		params->resp.tirn = qp->rss_qp.tirn;
+		if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner) ||
+		    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner_v2)) {
+			params->resp.tir_icm_addr =
 				MLX5_GET(create_tir_out, out, icm_address_31_0);
-			resp.tir_icm_addr |= (u64)MLX5_GET(create_tir_out, out,
-							   icm_address_39_32)
-					     << 32;
-			resp.tir_icm_addr |= (u64)MLX5_GET(create_tir_out, out,
-							   icm_address_63_40)
-					     << 40;
-			resp.comp_mask |=
+			params->resp.tir_icm_addr |=
+				(u64)MLX5_GET(create_tir_out, out,
+					      icm_address_39_32)
+				<< 32;
+			params->resp.tir_icm_addr |=
+				(u64)MLX5_GET(create_tir_out, out,
+					      icm_address_63_40)
+				<< 40;
+			params->resp.comp_mask |=
 				MLX5_IB_CREATE_QP_RESP_MASK_TIR_ICM_ADDR;
 		}
 	}
 
-	err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
-	if (err)
-		goto err_copy;
-
 	kvfree(in);
 	/* qpn is reserved for that QP */
 	qp->trans_qp.base.mqp.qpn = 0;
-	qp->flags |= MLX5_IB_QP_RSS;
+	qp->is_rss = true;
 	return 0;
 
-err_copy:
-	mlx5_cmd_destroy_tir(dev->mdev, qp->rss_qp.tirn, mucontext->devx_uid);
 err:
 	kvfree(in);
 	return err;
 }
 
-static void configure_responder_scat_cqe(struct ib_qp_init_attr *init_attr,
-					 void *qpc)
-{
-	int rcqe_sz;
-
-	if (init_attr->qp_type == MLX5_IB_QPT_DCI)
-		return;
-
-	rcqe_sz = mlx5_ib_get_cqe_size(init_attr->recv_cq);
-
-	if (init_attr->qp_type == MLX5_IB_QPT_DCT) {
-		if (rcqe_sz == 128)
-			MLX5_SET(dctc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE);
-
-		return;
-	}
-
-	MLX5_SET(qpc, qpc, cs_res,
-		 rcqe_sz == 128 ? MLX5_RES_SCAT_DATA64_CQE :
-				  MLX5_RES_SCAT_DATA32_CQE);
-}
-
 static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev,
+					 struct mlx5_ib_qp *qp,
 					 struct ib_qp_init_attr *init_attr,
-					 struct mlx5_ib_create_qp *ucmd,
 					 void *qpc)
 {
-	enum ib_qp_type qpt = init_attr->qp_type;
 	int scqe_sz;
-	bool allow_scat_cqe = 0;
+	bool allow_scat_cqe = false;
 
-	if (qpt == IB_QPT_UC || qpt == IB_QPT_UD)
-		return;
-
-	if (ucmd)
-		allow_scat_cqe = ucmd->flags & MLX5_QP_FLAG_ALLOW_SCATTER_CQE;
+	allow_scat_cqe = qp->flags_en & MLX5_QP_FLAG_ALLOW_SCATTER_CQE;
 
 	if (!allow_scat_cqe && init_attr->sq_sig_type != IB_SIGNAL_ALL_WR)
 		return;
@@ -1939,268 +1839,176 @@
 	return atomic_mode;
 }
 
-static inline bool check_flags_mask(uint64_t input, uint64_t supported)
+static int create_xrc_tgt_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+			     struct mlx5_create_qp_params *params)
 {
-	return (input & ~supported) == 0;
+	struct ib_qp_init_attr *attr = params->attr;
+	u32 uidx = params->uidx;
+	struct mlx5_ib_resources *devr = &dev->devr;
+	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
+	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
+	struct mlx5_core_dev *mdev = dev->mdev;
+	struct mlx5_ib_qp_base *base;
+	unsigned long flags;
+	void *qpc;
+	u32 *in;
+	int err;
+
+	if (attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+		qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;
+
+	in = kvzalloc(inlen, GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+
+	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_XRC);
+	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
+	MLX5_SET(qpc, qpc, pd, to_mpd(devr->p0)->pdn);
+
+	if (qp->flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+		MLX5_SET(qpc, qpc, block_lb_mc, 1);
+	if (qp->flags & IB_QP_CREATE_CROSS_CHANNEL)
+		MLX5_SET(qpc, qpc, cd_master, 1);
+	if (qp->flags & IB_QP_CREATE_MANAGED_SEND)
+		MLX5_SET(qpc, qpc, cd_slave_send, 1);
+	if (qp->flags & IB_QP_CREATE_MANAGED_RECV)
+		MLX5_SET(qpc, qpc, cd_slave_receive, 1);
+
+	MLX5_SET(qpc, qpc, rq_type, MLX5_SRQ_RQ);
+	MLX5_SET(qpc, qpc, no_sq, 1);
+	MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn);
+	MLX5_SET(qpc, qpc, cqn_snd, to_mcq(devr->c0)->mcq.cqn);
+	MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s0)->msrq.srqn);
+	MLX5_SET(qpc, qpc, xrcd, to_mxrcd(attr->xrcd)->xrcdn);
+	MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
+
+	/* 0xffffff means we ask to work with cqe version 0 */
+	if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1)
+		MLX5_SET(qpc, qpc, user_index, uidx);
+
+	if (qp->flags & IB_QP_CREATE_PCI_WRITE_END_PADDING) {
+		MLX5_SET(qpc, qpc, end_padding_mode,
+			 MLX5_WQ_END_PAD_MODE_ALIGN);
+		/* Special case to clean flag */
+		qp->flags &= ~IB_QP_CREATE_PCI_WRITE_END_PADDING;
+	}
+
+	base = &qp->trans_qp.base;
+	err = mlx5_qpc_create_qp(dev, &base->mqp, in, inlen, out);
+	kvfree(in);
+	if (err)
+		return err;
+
+	base->container_mibqp = qp;
+	base->mqp.event = mlx5_ib_qp_event;
+	if (MLX5_CAP_GEN(mdev, ece_support))
+		params->resp.ece_options = MLX5_GET(create_qp_out, out, ece);
+
+	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+	list_add_tail(&qp->qps_list, &dev->qp_list);
+	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
+
+	qp->trans_qp.xrcdn = to_mxrcd(attr->xrcd)->xrcdn;
+	return 0;
 }
 
-static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
-			    struct ib_qp_init_attr *init_attr,
-			    struct ib_udata *udata, struct mlx5_ib_qp *qp)
+static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+			  struct mlx5_ib_qp *qp,
+			  struct mlx5_create_qp_params *params)
 {
+	struct ib_qp_init_attr *init_attr = params->attr;
+	struct mlx5_ib_create_qp *ucmd = params->ucmd;
+	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
+	struct ib_udata *udata = params->udata;
+	u32 uidx = params->uidx;
 	struct mlx5_ib_resources *devr = &dev->devr;
 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
 	struct mlx5_core_dev *mdev = dev->mdev;
-	struct mlx5_ib_create_qp_resp resp = {};
-	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
-		udata, struct mlx5_ib_ucontext, ibucontext);
 	struct mlx5_ib_cq *send_cq;
 	struct mlx5_ib_cq *recv_cq;
 	unsigned long flags;
-	u32 uidx = MLX5_IB_DEFAULT_UIDX;
-	struct mlx5_ib_create_qp ucmd;
 	struct mlx5_ib_qp_base *base;
 	int mlx5_st;
 	void *qpc;
 	u32 *in;
 	int err;
 
-	mutex_init(&qp->mutex);
 	spin_lock_init(&qp->sq.lock);
 	spin_lock_init(&qp->rq.lock);
 
-	mlx5_st = to_mlx5_st(init_attr->qp_type);
+	mlx5_st = to_mlx5_st(qp->type);
 	if (mlx5_st < 0)
 		return -EINVAL;
 
-	if (init_attr->rwq_ind_tbl) {
-		if (!udata)
-			return -ENOSYS;
-
-		err = create_rss_raw_qp_tir(dev, qp, pd, init_attr, udata);
-		return err;
-	}
-
-	if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
-		if (!MLX5_CAP_GEN(mdev, block_lb_mc)) {
-			mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n");
-			return -EINVAL;
-		} else {
-			qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK;
-		}
-	}
-
-	if (init_attr->create_flags &
-			(IB_QP_CREATE_CROSS_CHANNEL |
-			 IB_QP_CREATE_MANAGED_SEND |
-			 IB_QP_CREATE_MANAGED_RECV)) {
-		if (!MLX5_CAP_GEN(mdev, cd)) {
-			mlx5_ib_dbg(dev, "cross-channel isn't supported\n");
-			return -EINVAL;
-		}
-		if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL)
-			qp->flags |= MLX5_IB_QP_CROSS_CHANNEL;
-		if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND)
-			qp->flags |= MLX5_IB_QP_MANAGED_SEND;
-		if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV)
-			qp->flags |= MLX5_IB_QP_MANAGED_RECV;
-	}
-
-	if (init_attr->qp_type == IB_QPT_UD &&
-	    (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO))
-		if (!MLX5_CAP_GEN(mdev, ipoib_basic_offloads)) {
-			mlx5_ib_dbg(dev, "ipoib UD lso qp isn't supported\n");
-			return -EOPNOTSUPP;
-		}
-
-	if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) {
-		if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
-			mlx5_ib_dbg(dev, "Scatter FCS is supported only for Raw Packet QPs");
-			return -EOPNOTSUPP;
-		}
-		if (!MLX5_CAP_GEN(dev->mdev, eth_net_offloads) ||
-		    !MLX5_CAP_ETH(dev->mdev, scatter_fcs)) {
-			mlx5_ib_dbg(dev, "Scatter FCS isn't supported\n");
-			return -EOPNOTSUPP;
-		}
-		qp->flags |= MLX5_IB_QP_CAP_SCATTER_FCS;
-	}
-
 	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
 		qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;
 
-	if (init_attr->create_flags & IB_QP_CREATE_CVLAN_STRIPPING) {
-		if (!(MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
-		      MLX5_CAP_ETH(dev->mdev, vlan_cap)) ||
-		    (init_attr->qp_type != IB_QPT_RAW_PACKET))
-			return -EOPNOTSUPP;
-		qp->flags |= MLX5_IB_QP_CVLAN_STRIPPING;
-	}
-
-	if (udata) {
-		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
-			mlx5_ib_dbg(dev, "copy failed\n");
-			return -EFAULT;
-		}
-
-		if (!check_flags_mask(ucmd.flags,
-				      MLX5_QP_FLAG_ALLOW_SCATTER_CQE |
-				      MLX5_QP_FLAG_BFREG_INDEX |
-				      MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE |
-				      MLX5_QP_FLAG_SCATTER_CQE |
-				      MLX5_QP_FLAG_SIGNATURE |
-				      MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC |
-				      MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
-				      MLX5_QP_FLAG_TUNNEL_OFFLOADS |
-				      MLX5_QP_FLAG_TYPE_DCI |
-				      MLX5_QP_FLAG_TYPE_DCT))
-			return -EINVAL;
-
-		err = get_qp_user_index(ucontext, &ucmd, udata->inlen, &uidx);
-		if (err)
-			return err;
-
-		qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE);
-		if (MLX5_CAP_GEN(dev->mdev, sctr_data_cqe))
-			qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE);
-		if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) {
-			if (init_attr->qp_type != IB_QPT_RAW_PACKET ||
-			    !tunnel_offload_supported(mdev)) {
-				mlx5_ib_dbg(dev, "Tunnel offload isn't supported\n");
-				return -EOPNOTSUPP;
-			}
-			qp->flags_en |= MLX5_QP_FLAG_TUNNEL_OFFLOADS;
-		}
-
-		if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC) {
-			if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
-				mlx5_ib_dbg(dev, "Self-LB UC isn't supported\n");
-				return -EOPNOTSUPP;
-			}
-			qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC;
-		}
-
-		if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) {
-			if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
-				mlx5_ib_dbg(dev, "Self-LB UM isn't supported\n");
-				return -EOPNOTSUPP;
-			}
-			qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC;
-		}
-
-		if (ucmd.flags & MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE) {
-			if (init_attr->qp_type != IB_QPT_RC ||
-				!MLX5_CAP_GEN(dev->mdev, qp_packet_based)) {
-				mlx5_ib_dbg(dev, "packet based credit mode isn't supported\n");
-				return -EOPNOTSUPP;
-			}
-			qp->flags |= MLX5_IB_QP_PACKET_BASED_CREDIT;
-		}
-
-		if (init_attr->create_flags & IB_QP_CREATE_SOURCE_QPN) {
-			if (init_attr->qp_type != IB_QPT_UD ||
-			    (MLX5_CAP_GEN(dev->mdev, port_type) !=
-			     MLX5_CAP_PORT_TYPE_IB) ||
-			    !mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) {
-				mlx5_ib_dbg(dev, "Source QP option isn't supported\n");
-				return -EOPNOTSUPP;
-			}
-
-			qp->flags |= MLX5_IB_QP_UNDERLAY;
-			qp->underlay_qpn = init_attr->source_qpn;
-		}
-	} else {
-		qp->wq_sig = !!wq_signature;
-	}
+	if (qp->flags & IB_QP_CREATE_SOURCE_QPN)
+		qp->underlay_qpn = init_attr->source_qpn;
 
 	base = (init_attr->qp_type == IB_QPT_RAW_PACKET ||
-		qp->flags & MLX5_IB_QP_UNDERLAY) ?
+		qp->flags & IB_QP_CREATE_SOURCE_QPN) ?
 	       &qp->raw_packet_qp.rq.base :
 	       &qp->trans_qp.base;
 
 	qp->has_rq = qp_has_rq(init_attr);
-	err = set_rq_size(dev, &init_attr->cap, qp->has_rq,
-			  qp, udata ? &ucmd : NULL);
+	err = set_rq_size(dev, &init_attr->cap, qp->has_rq, qp, ucmd);
 	if (err) {
 		mlx5_ib_dbg(dev, "err %d\n", err);
 		return err;
 	}
 
-	if (pd) {
-		if (udata) {
-			__u32 max_wqes =
-				1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
-			mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count);
-			if (ucmd.rq_wqe_shift != qp->rq.wqe_shift ||
-			    ucmd.rq_wqe_count != qp->rq.wqe_cnt) {
-				mlx5_ib_dbg(dev, "invalid rq params\n");
-				return -EINVAL;
-			}
-			if (ucmd.sq_wqe_count > max_wqes) {
-				mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n",
-					    ucmd.sq_wqe_count, max_wqes);
-				return -EINVAL;
-			}
-			if (init_attr->create_flags &
-			    mlx5_ib_create_qp_sqpn_qp1()) {
-				mlx5_ib_dbg(dev, "user-space is not allowed to create UD QPs spoofing as QP1\n");
-				return -EINVAL;
-			}
-			err = create_user_qp(dev, pd, qp, udata, init_attr, &in,
-					     &resp, &inlen, base);
-			if (err)
-				mlx5_ib_dbg(dev, "err %d\n", err);
-		} else {
-			err = create_kernel_qp(dev, init_attr, qp, &in, &inlen,
-					       base);
-			if (err)
-				mlx5_ib_dbg(dev, "err %d\n", err);
-		}
+	if (ucmd->rq_wqe_shift != qp->rq.wqe_shift ||
+	    ucmd->rq_wqe_count != qp->rq.wqe_cnt)
+		return -EINVAL;
 
-		if (err)
-			return err;
-	} else {
-		in = kvzalloc(inlen, GFP_KERNEL);
-		if (!in)
-			return -ENOMEM;
+	if (ucmd->sq_wqe_count > (1 << MLX5_CAP_GEN(mdev, log_max_qp_sz)))
+		return -EINVAL;
 
-		qp->create_type = MLX5_QP_EMPTY;
-	}
+	err = _create_user_qp(dev, pd, qp, udata, init_attr, &in, &params->resp,
+			      &inlen, base, ucmd);
+	if (err)
+		return err;
 
 	if (is_sqp(init_attr->qp_type))
 		qp->port = init_attr->port_num;
 
+	if (MLX5_CAP_GEN(mdev, ece_support))
+		MLX5_SET(create_qp_in, in, ece, ucmd->ece_options);
 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
 
 	MLX5_SET(qpc, qpc, st, mlx5_st);
 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
+	MLX5_SET(qpc, qpc, pd, to_mpd(pd)->pdn);
 
-	if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR)
-		MLX5_SET(qpc, qpc, pd, to_mpd(pd ? pd : devr->p0)->pdn);
-	else
-		MLX5_SET(qpc, qpc, latency_sensitive, 1);
-
-
-	if (qp->wq_sig)
+	if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE)
 		MLX5_SET(qpc, qpc, wq_signature, 1);
 
-	if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+	if (qp->flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
 		MLX5_SET(qpc, qpc, block_lb_mc, 1);
 
-	if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
+	if (qp->flags & IB_QP_CREATE_CROSS_CHANNEL)
 		MLX5_SET(qpc, qpc, cd_master, 1);
-	if (qp->flags & MLX5_IB_QP_MANAGED_SEND)
+	if (qp->flags & IB_QP_CREATE_MANAGED_SEND)
 		MLX5_SET(qpc, qpc, cd_slave_send, 1);
-	if (qp->flags & MLX5_IB_QP_MANAGED_RECV)
+	if (qp->flags & IB_QP_CREATE_MANAGED_RECV)
 		MLX5_SET(qpc, qpc, cd_slave_receive, 1);
-	if (qp->flags & MLX5_IB_QP_PACKET_BASED_CREDIT)
+	if (qp->flags_en & MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE)
 		MLX5_SET(qpc, qpc, req_e2e_credit_mode, 1);
-	if (qp->scat_cqe && is_connected(init_attr->qp_type)) {
-		configure_responder_scat_cqe(init_attr, qpc);
-		configure_requester_scat_cqe(dev, init_attr,
-					     udata ? &ucmd : NULL,
-					     qpc);
+	if ((qp->flags_en & MLX5_QP_FLAG_SCATTER_CQE) &&
+	    (init_attr->qp_type == IB_QPT_RC ||
+	     init_attr->qp_type == IB_QPT_UC)) {
+		int rcqe_sz = mlx5_ib_get_cqe_size(init_attr->recv_cq);
+
+		MLX5_SET(qpc, qpc, cs_res,
+			 rcqe_sz == 128 ? MLX5_RES_SCAT_DATA64_CQE :
+					  MLX5_RES_SCAT_DATA32_CQE);
 	}
+	if ((qp->flags_en & MLX5_QP_FLAG_SCATTER_CQE) &&
+	    (qp->type == MLX5_IB_QPT_DCI || qp->type == IB_QPT_RC))
+		configure_requester_scat_cqe(dev, qp, init_attr, qpc);
 
 	if (qp->rq.wqe_cnt) {
 		MLX5_SET(qpc, qpc, log_rq_stride, qp->rq.wqe_shift - 4);
@@ -2221,23 +2029,17 @@
 
 	/* Set default resources */
 	switch (init_attr->qp_type) {
-	case IB_QPT_XRC_TGT:
-		MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn);
-		MLX5_SET(qpc, qpc, cqn_snd, to_mcq(devr->c0)->mcq.cqn);
-		MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s0)->msrq.srqn);
-		MLX5_SET(qpc, qpc, xrcd, to_mxrcd(init_attr->xrcd)->xrcdn);
-		break;
 	case IB_QPT_XRC_INI:
 		MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(devr->c0)->mcq.cqn);
-		MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn);
+		MLX5_SET(qpc, qpc, xrcd, devr->xrcdn1);
 		MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s0)->msrq.srqn);
 		break;
 	default:
 		if (init_attr->srq) {
-			MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x0)->xrcdn);
+			MLX5_SET(qpc, qpc, xrcd, devr->xrcdn0);
 			MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(init_attr->srq)->msrq.srqn);
 		} else {
-			MLX5_SET(qpc, qpc, xrcd, to_mxrcd(devr->x1)->xrcdn);
+			MLX5_SET(qpc, qpc, xrcd, devr->xrcdn1);
 			MLX5_SET(qpc, qpc, srqn_rmpn_xrqn, to_msrq(devr->s1)->msrq.srqn);
 		}
 	}
@@ -2254,52 +2056,33 @@
 	if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1)
 		MLX5_SET(qpc, qpc, user_index, uidx);
 
-	/* we use IB_QP_CREATE_IPOIB_UD_LSO to indicates ipoib qp */
-	if (init_attr->qp_type == IB_QPT_UD &&
-	    (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)) {
-		MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1);
-		qp->flags |= MLX5_IB_QP_LSO;
-	}
-
-	if (init_attr->create_flags & IB_QP_CREATE_PCI_WRITE_END_PADDING) {
-		if (!MLX5_CAP_GEN(dev->mdev, end_pad)) {
-			mlx5_ib_dbg(dev, "scatter end padding is not supported\n");
-			err = -EOPNOTSUPP;
-			goto err;
-		} else if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
-			MLX5_SET(qpc, qpc, end_padding_mode,
-				 MLX5_WQ_END_PAD_MODE_ALIGN);
-		} else {
-			qp->flags |= MLX5_IB_QP_PCI_WRITE_END_PADDING;
-		}
-	}
-
-	if (inlen < 0) {
-		err = -EINVAL;
-		goto err;
+	if (qp->flags & IB_QP_CREATE_PCI_WRITE_END_PADDING &&
+	    init_attr->qp_type != IB_QPT_RAW_PACKET) {
+		MLX5_SET(qpc, qpc, end_padding_mode,
+			 MLX5_WQ_END_PAD_MODE_ALIGN);
+		/* Special case to clean flag */
+		qp->flags &= ~IB_QP_CREATE_PCI_WRITE_END_PADDING;
 	}
 
 	if (init_attr->qp_type == IB_QPT_RAW_PACKET ||
-	    qp->flags & MLX5_IB_QP_UNDERLAY) {
-		qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr;
+	    qp->flags & IB_QP_CREATE_SOURCE_QPN) {
+		qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd->sq_buf_addr;
 		raw_packet_qp_copy_info(qp, &qp->raw_packet_qp);
 		err = create_raw_packet_qp(dev, qp, in, inlen, pd, udata,
-					   &resp);
-	} else {
-		err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen);
-	}
-
-	if (err) {
-		mlx5_ib_dbg(dev, "create qp failed\n");
-		goto err_create;
-	}
+					   &params->resp);
+	} else
+		err = mlx5_qpc_create_qp(dev, &base->mqp, in, inlen, out);
 
 	kvfree(in);
+	if (err)
+		goto err_create;
 
 	base->container_mibqp = qp;
 	base->mqp.event = mlx5_ib_qp_event;
+	if (MLX5_CAP_GEN(mdev, ece_support))
+		params->resp.ece_options = MLX5_GET(create_qp_out, out, ece);
 
-	get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq,
+	get_cqs(qp->type, init_attr->send_cq, init_attr->recv_cq,
 		&send_cq, &recv_cq);
 	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
 	mlx5_ib_lock_cqs(send_cq, recv_cq);
@@ -2319,13 +2102,136 @@
 	return 0;
 
 err_create:
-	if (qp->create_type == MLX5_QP_USER)
-		destroy_qp_user(dev, pd, qp, base, udata);
-	else if (qp->create_type == MLX5_QP_KERNEL)
-		destroy_qp_kernel(dev, qp);
+	destroy_qp(dev, qp, base, udata);
+	return err;
+}
 
-err:
+static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+			    struct mlx5_ib_qp *qp,
+			    struct mlx5_create_qp_params *params)
+{
+	struct ib_qp_init_attr *attr = params->attr;
+	u32 uidx = params->uidx;
+	struct mlx5_ib_resources *devr = &dev->devr;
+	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
+	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
+	struct mlx5_core_dev *mdev = dev->mdev;
+	struct mlx5_ib_cq *send_cq;
+	struct mlx5_ib_cq *recv_cq;
+	unsigned long flags;
+	struct mlx5_ib_qp_base *base;
+	int mlx5_st;
+	void *qpc;
+	u32 *in;
+	int err;
+
+	spin_lock_init(&qp->sq.lock);
+	spin_lock_init(&qp->rq.lock);
+
+	mlx5_st = to_mlx5_st(qp->type);
+	if (mlx5_st < 0)
+		return -EINVAL;
+
+	if (attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+		qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;
+
+	base = &qp->trans_qp.base;
+
+	qp->has_rq = qp_has_rq(attr);
+	err = set_rq_size(dev, &attr->cap, qp->has_rq, qp, NULL);
+	if (err) {
+		mlx5_ib_dbg(dev, "err %d\n", err);
+		return err;
+	}
+
+	err = _create_kernel_qp(dev, attr, qp, &in, &inlen, base);
+	if (err)
+		return err;
+
+	if (is_sqp(attr->qp_type))
+		qp->port = attr->port_num;
+
+	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+
+	MLX5_SET(qpc, qpc, st, mlx5_st);
+	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
+
+	if (attr->qp_type != MLX5_IB_QPT_REG_UMR)
+		MLX5_SET(qpc, qpc, pd, to_mpd(pd ? pd : devr->p0)->pdn);
+	else
+		MLX5_SET(qpc, qpc, latency_sensitive, 1);
+
+
+	if (qp->flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+		MLX5_SET(qpc, qpc, block_lb_mc, 1);
+
+	if (qp->rq.wqe_cnt) {
+		MLX5_SET(qpc, qpc, log_rq_stride, qp->rq.wqe_shift - 4);
+		MLX5_SET(qpc, qpc, log_rq_size, ilog2(qp->rq.wqe_cnt));
+	}
+
+	MLX5_SET(qpc, qpc, rq_type, get_rx_type(qp, attr));
+
+	if (qp->sq.wqe_cnt)
+		MLX5_SET(qpc, qpc, log_sq_size, ilog2(qp->sq.wqe_cnt));
+	else
+		MLX5_SET(qpc, qpc, no_sq, 1);
+
+	if (attr->srq) {
+		MLX5_SET(qpc, qpc, xrcd, devr->xrcdn0);
+		MLX5_SET(qpc, qpc, srqn_rmpn_xrqn,
+			 to_msrq(attr->srq)->msrq.srqn);
+	} else {
+		MLX5_SET(qpc, qpc, xrcd, devr->xrcdn1);
+		MLX5_SET(qpc, qpc, srqn_rmpn_xrqn,
+			 to_msrq(devr->s1)->msrq.srqn);
+	}
+
+	if (attr->send_cq)
+		MLX5_SET(qpc, qpc, cqn_snd, to_mcq(attr->send_cq)->mcq.cqn);
+
+	if (attr->recv_cq)
+		MLX5_SET(qpc, qpc, cqn_rcv, to_mcq(attr->recv_cq)->mcq.cqn);
+
+	MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
+
+	/* 0xffffff means we ask to work with cqe version 0 */
+	if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1)
+		MLX5_SET(qpc, qpc, user_index, uidx);
+
+	/* we use IB_QP_CREATE_IPOIB_UD_LSO to indicate an ipoib qp */
+	if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO)
+		MLX5_SET(qpc, qpc, ulp_stateless_offload_mode, 1);
+
+	err = mlx5_qpc_create_qp(dev, &base->mqp, in, inlen, out);
 	kvfree(in);
+	if (err)
+		goto err_create;
+
+	base->container_mibqp = qp;
+	base->mqp.event = mlx5_ib_qp_event;
+
+	get_cqs(qp->type, attr->send_cq, attr->recv_cq,
+		&send_cq, &recv_cq);
+	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+	mlx5_ib_lock_cqs(send_cq, recv_cq);
+	/* Maintain device to QPs access, needed for further handling via reset
+	 * flow
+	 */
+	list_add_tail(&qp->qps_list, &dev->qp_list);
+	/* Maintain CQ to QPs access, needed for further handling via reset flow
+	 */
+	if (send_cq)
+		list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp);
+	if (recv_cq)
+		list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp);
+	mlx5_ib_unlock_cqs(send_cq, recv_cq);
+	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
+
+	return 0;
+
+err_create:
+	destroy_qp(dev, qp, base, NULL);
 	return err;
 }
 
@@ -2387,11 +2293,6 @@
 	}
 }
 
-static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp)
-{
-	return to_mpd(qp->ibqp.pd);
-}
-
 static void get_cqs(enum ib_qp_type qp_type,
 		    struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq,
 		    struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq)
@@ -2412,14 +2313,10 @@
 	case IB_QPT_RC:
 	case IB_QPT_UC:
 	case IB_QPT_UD:
-	case IB_QPT_RAW_IPV6:
-	case IB_QPT_RAW_ETHERTYPE:
 	case IB_QPT_RAW_PACKET:
 		*send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL;
 		*recv_cq = ib_recv_cq ? to_mcq(ib_recv_cq) : NULL;
 		break;
-
-	case IB_QPT_MAX:
 	default:
 		*send_cq = NULL;
 		*recv_cq = NULL;
@@ -2439,22 +2336,21 @@
 	unsigned long flags;
 	int err;
 
-	if (qp->ibqp.rwq_ind_tbl) {
+	if (qp->is_rss) {
 		destroy_rss_raw_qp_tir(dev, qp);
 		return;
 	}
 
-	base = (qp->ibqp.qp_type == IB_QPT_RAW_PACKET ||
-		qp->flags & MLX5_IB_QP_UNDERLAY) ?
-	       &qp->raw_packet_qp.rq.base :
-	       &qp->trans_qp.base;
+	base = (qp->type == IB_QPT_RAW_PACKET ||
+		qp->flags & IB_QP_CREATE_SOURCE_QPN) ?
+		       &qp->raw_packet_qp.rq.base :
+		       &qp->trans_qp.base;
 
 	if (qp->state != IB_QPS_RESET) {
-		if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET &&
-		    !(qp->flags & MLX5_IB_QP_UNDERLAY)) {
-			err = mlx5_core_qp_modify(dev->mdev,
-						  MLX5_CMD_OP_2RST_QP, 0,
-						  NULL, &base->mqp);
+		if (qp->type != IB_QPT_RAW_PACKET &&
+		    !(qp->flags & IB_QP_CREATE_SOURCE_QPN)) {
+			err = mlx5_core_qp_modify(dev, MLX5_CMD_OP_2RST_QP, 0,
+						  NULL, &base->mqp, NULL);
 		} else {
 			struct mlx5_modify_raw_qp_param raw_qp_param = {
 				.operation = MLX5_CMD_OP_2RST_QP
@@ -2467,8 +2363,8 @@
 				     base->mqp.qpn);
 	}
 
-	get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
-		&send_cq, &recv_cq);
+	get_cqs(qp->type, qp->ibqp.send_cq, qp->ibqp.recv_cq, &send_cq,
+		&recv_cq);
 
 	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
 	mlx5_ib_lock_cqs(send_cq, recv_cq);
@@ -2480,7 +2376,7 @@
 	if (recv_cq)
 		list_del(&qp->cq_recv_list);
 
-	if (qp->create_type == MLX5_QP_KERNEL) {
+	if (!udata) {
 		__mlx5_ib_cq_clean(recv_cq, base->mqp.qpn,
 				   qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
 		if (send_cq != recv_cq)
@@ -2490,264 +2386,465 @@
 	mlx5_ib_unlock_cqs(send_cq, recv_cq);
 	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
 
-	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET ||
-	    qp->flags & MLX5_IB_QP_UNDERLAY) {
+	if (qp->type == IB_QPT_RAW_PACKET ||
+	    qp->flags & IB_QP_CREATE_SOURCE_QPN) {
 		destroy_raw_packet_qp(dev, qp);
 	} else {
-		err = mlx5_core_destroy_qp(dev->mdev, &base->mqp);
+		err = mlx5_core_destroy_qp(dev, &base->mqp);
 		if (err)
 			mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n",
 				     base->mqp.qpn);
 	}
 
-	if (qp->create_type == MLX5_QP_KERNEL)
-		destroy_qp_kernel(dev, qp);
-	else if (qp->create_type == MLX5_QP_USER)
-		destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base, udata);
+	destroy_qp(dev, qp, base, udata);
 }
 
-static const char *ib_qp_type_str(enum ib_qp_type type)
+static int create_dct(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+		      struct mlx5_ib_qp *qp,
+		      struct mlx5_create_qp_params *params)
 {
-	switch (type) {
-	case IB_QPT_SMI:
-		return "IB_QPT_SMI";
-	case IB_QPT_GSI:
-		return "IB_QPT_GSI";
-	case IB_QPT_RC:
-		return "IB_QPT_RC";
-	case IB_QPT_UC:
-		return "IB_QPT_UC";
-	case IB_QPT_UD:
-		return "IB_QPT_UD";
-	case IB_QPT_RAW_IPV6:
-		return "IB_QPT_RAW_IPV6";
-	case IB_QPT_RAW_ETHERTYPE:
-		return "IB_QPT_RAW_ETHERTYPE";
-	case IB_QPT_XRC_INI:
-		return "IB_QPT_XRC_INI";
-	case IB_QPT_XRC_TGT:
-		return "IB_QPT_XRC_TGT";
-	case IB_QPT_RAW_PACKET:
-		return "IB_QPT_RAW_PACKET";
-	case MLX5_IB_QPT_REG_UMR:
-		return "MLX5_IB_QPT_REG_UMR";
-	case IB_QPT_DRIVER:
-		return "IB_QPT_DRIVER";
-	case IB_QPT_MAX:
-	default:
-		return "Invalid QP type";
-	}
-}
-
-static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd,
-					struct ib_qp_init_attr *attr,
-					struct mlx5_ib_create_qp *ucmd,
-					struct ib_udata *udata)
-{
-	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
-		udata, struct mlx5_ib_ucontext, ibucontext);
-	struct mlx5_ib_qp *qp;
-	int err = 0;
-	u32 uidx = MLX5_IB_DEFAULT_UIDX;
+	struct ib_qp_init_attr *attr = params->attr;
+	struct mlx5_ib_create_qp *ucmd = params->ucmd;
+	u32 uidx = params->uidx;
 	void *dctc;
 
-	if (!attr->srq || !attr->recv_cq)
-		return ERR_PTR(-EINVAL);
-
-	err = get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), &uidx);
-	if (err)
-		return ERR_PTR(err);
-
-	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
-	if (!qp)
-		return ERR_PTR(-ENOMEM);
+	if (mlx5_lag_is_active(dev->mdev) && !MLX5_CAP_GEN(dev->mdev, lag_dct))
+		return -EOPNOTSUPP;
 
 	qp->dct.in = kzalloc(MLX5_ST_SZ_BYTES(create_dct_in), GFP_KERNEL);
-	if (!qp->dct.in) {
-		err = -ENOMEM;
-		goto err_free;
-	}
+	if (!qp->dct.in)
+		return -ENOMEM;
 
 	MLX5_SET(create_dct_in, qp->dct.in, uid, to_mpd(pd)->uid);
 	dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry);
-	qp->qp_sub_type = MLX5_IB_QPT_DCT;
 	MLX5_SET(dctc, dctc, pd, to_mpd(pd)->pdn);
 	MLX5_SET(dctc, dctc, srqn_xrqn, to_msrq(attr->srq)->msrq.srqn);
 	MLX5_SET(dctc, dctc, cqn, to_mcq(attr->recv_cq)->mcq.cqn);
 	MLX5_SET64(dctc, dctc, dc_access_key, ucmd->access_key);
 	MLX5_SET(dctc, dctc, user_index, uidx);
+	if (MLX5_CAP_GEN(dev->mdev, ece_support))
+		MLX5_SET(dctc, dctc, ece, ucmd->ece_options);
 
-	if (ucmd->flags & MLX5_QP_FLAG_SCATTER_CQE)
-		configure_responder_scat_cqe(attr, dctc);
+	if (qp->flags_en & MLX5_QP_FLAG_SCATTER_CQE) {
+		int rcqe_sz = mlx5_ib_get_cqe_size(attr->recv_cq);
+
+		if (rcqe_sz == 128)
+			MLX5_SET(dctc, dctc, cs_res, MLX5_RES_SCAT_DATA64_CQE);
+	}
 
 	qp->state = IB_QPS_RESET;
 
-	return &qp->ibqp;
-err_free:
-	kfree(qp);
-	return ERR_PTR(err);
-}
-
-static int set_mlx_qp_type(struct mlx5_ib_dev *dev,
-			   struct ib_qp_init_attr *init_attr,
-			   struct mlx5_ib_create_qp *ucmd,
-			   struct ib_udata *udata)
-{
-	enum { MLX_QP_FLAGS = MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI };
-	int err;
-
-	if (!udata)
-		return -EINVAL;
-
-	if (udata->inlen < sizeof(*ucmd)) {
-		mlx5_ib_dbg(dev, "create_qp user command is smaller than expected\n");
-		return -EINVAL;
-	}
-	err = ib_copy_from_udata(ucmd, udata, sizeof(*ucmd));
-	if (err)
-		return err;
-
-	if ((ucmd->flags & MLX_QP_FLAGS) == MLX5_QP_FLAG_TYPE_DCI) {
-		init_attr->qp_type = MLX5_IB_QPT_DCI;
-	} else {
-		if ((ucmd->flags & MLX_QP_FLAGS) == MLX5_QP_FLAG_TYPE_DCT) {
-			init_attr->qp_type = MLX5_IB_QPT_DCT;
-		} else {
-			mlx5_ib_dbg(dev, "Invalid QP flags\n");
-			return -EINVAL;
-		}
-	}
-
-	if (!MLX5_CAP_GEN(dev->mdev, dct)) {
-		mlx5_ib_dbg(dev, "DC transport is not supported\n");
-		return -EOPNOTSUPP;
-	}
-
 	return 0;
 }
 
-struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
-				struct ib_qp_init_attr *verbs_init_attr,
-				struct ib_udata *udata)
+static int check_qp_type(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,
+			 enum ib_qp_type *type)
 {
-	struct mlx5_ib_dev *dev;
-	struct mlx5_ib_qp *qp;
-	u16 xrcdn = 0;
-	int err;
-	struct ib_qp_init_attr mlx_init_attr;
-	struct ib_qp_init_attr *init_attr = verbs_init_attr;
+	if (attr->qp_type == IB_QPT_DRIVER && !MLX5_CAP_GEN(dev->mdev, dct))
+		goto out;
+
+	switch (attr->qp_type) {
+	case IB_QPT_XRC_TGT:
+	case IB_QPT_XRC_INI:
+		if (!MLX5_CAP_GEN(dev->mdev, xrc))
+			goto out;
+		fallthrough;
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+	case IB_QPT_SMI:
+	case MLX5_IB_QPT_HW_GSI:
+	case IB_QPT_DRIVER:
+	case IB_QPT_GSI:
+	case IB_QPT_RAW_PACKET:
+	case IB_QPT_UD:
+	case MLX5_IB_QPT_REG_UMR:
+		break;
+	default:
+		goto out;
+	}
+
+	*type = attr->qp_type;
+	return 0;
+
+out:
+	mlx5_ib_dbg(dev, "Unsupported QP type %d\n", attr->qp_type);
+	return -EOPNOTSUPP;
+}
+
+static int check_valid_flow(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+			    struct ib_qp_init_attr *attr,
+			    struct ib_udata *udata)
+{
 	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
 		udata, struct mlx5_ib_ucontext, ibucontext);
 
-	if (pd) {
-		dev = to_mdev(pd->device);
+	if (!udata) {
+		/* Kernel create_qp callers */
+		if (attr->rwq_ind_tbl)
+			return -EOPNOTSUPP;
 
-		if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
-			if (!ucontext) {
-				mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n");
-				return ERR_PTR(-EINVAL);
-			} else if (!ucontext->cqe_version) {
-				mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n");
-				return ERR_PTR(-EINVAL);
-			}
-		}
-	} else {
-		/* being cautious here */
-		if (init_attr->qp_type != IB_QPT_XRC_TGT &&
-		    init_attr->qp_type != MLX5_IB_QPT_REG_UMR) {
-			pr_warn("%s: no PD for transport %s\n", __func__,
-				ib_qp_type_str(init_attr->qp_type));
-			return ERR_PTR(-EINVAL);
-		}
-		dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device);
-	}
-
-	if (init_attr->qp_type == IB_QPT_DRIVER) {
-		struct mlx5_ib_create_qp ucmd;
-
-		init_attr = &mlx_init_attr;
-		memcpy(init_attr, verbs_init_attr, sizeof(*verbs_init_attr));
-		err = set_mlx_qp_type(dev, init_attr, &ucmd, udata);
-		if (err)
-			return ERR_PTR(err);
-
-		if (init_attr->qp_type == MLX5_IB_QPT_DCI) {
-			if (init_attr->cap.max_recv_wr ||
-			    init_attr->cap.max_recv_sge) {
-				mlx5_ib_dbg(dev, "DCI QP requires zero size receive queue\n");
-				return ERR_PTR(-EINVAL);
-			}
-		} else {
-			return mlx5_ib_create_dct(pd, init_attr, &ucmd, udata);
+		switch (attr->qp_type) {
+		case IB_QPT_RAW_PACKET:
+		case IB_QPT_DRIVER:
+			return -EOPNOTSUPP;
+		default:
+			return 0;
 		}
 	}
 
-	switch (init_attr->qp_type) {
-	case IB_QPT_XRC_TGT:
-	case IB_QPT_XRC_INI:
-		if (!MLX5_CAP_GEN(dev->mdev, xrc)) {
-			mlx5_ib_dbg(dev, "XRC not supported\n");
-			return ERR_PTR(-ENOSYS);
-		}
-		init_attr->recv_cq = NULL;
-		if (init_attr->qp_type == IB_QPT_XRC_TGT) {
-			xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
-			init_attr->send_cq = NULL;
-		}
+	/* Userspace create_qp callers */
+	if (attr->qp_type == IB_QPT_RAW_PACKET && !ucontext->cqe_version) {
+		mlx5_ib_dbg(dev,
+			"Raw Packet QP is only supported for CQE version > 0\n");
+		return -EINVAL;
+	}
 
-		/* fall through */
-	case IB_QPT_RAW_PACKET:
-	case IB_QPT_RC:
-	case IB_QPT_UC:
-	case IB_QPT_UD:
-	case IB_QPT_SMI:
-	case MLX5_IB_QPT_HW_GSI:
-	case MLX5_IB_QPT_REG_UMR:
-	case MLX5_IB_QPT_DCI:
-		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
-		if (!qp)
-			return ERR_PTR(-ENOMEM);
+	if (attr->qp_type != IB_QPT_RAW_PACKET && attr->rwq_ind_tbl) {
+		mlx5_ib_dbg(dev,
+			    "Wrong QP type %d for the RWQ indirect table\n",
+			    attr->qp_type);
+		return -EINVAL;
+	}
 
-		err = create_qp_common(dev, pd, init_attr, udata, qp);
-		if (err) {
-			mlx5_ib_dbg(dev, "create_qp_common failed\n");
-			kfree(qp);
-			return ERR_PTR(err);
-		}
+	/*
+	 * We don't need to see this warning, it means that kernel code
+	 * is missing an ib_pd. Placed here to catch developer's mistakes.
+	 */
+	WARN_ONCE(!pd && attr->qp_type != IB_QPT_XRC_TGT,
+		  "There is a missing PD pointer assignment\n");
+	return 0;
+}
 
-		if (is_qp0(init_attr->qp_type))
-			qp->ibqp.qp_num = 0;
-		else if (is_qp1(init_attr->qp_type))
-			qp->ibqp.qp_num = 1;
-		else
-			qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn;
+static void process_vendor_flag(struct mlx5_ib_dev *dev, int *flags, int flag,
+				bool cond, struct mlx5_ib_qp *qp)
+{
+	if (!(*flags & flag))
+		return;
 
-		mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n",
-			    qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn,
-			    init_attr->recv_cq ? to_mcq(init_attr->recv_cq)->mcq.cqn : -1,
-			    init_attr->send_cq ? to_mcq(init_attr->send_cq)->mcq.cqn : -1);
+	if (cond) {
+		qp->flags_en |= flag;
+		*flags &= ~flag;
+		return;
+	}
 
-		qp->trans_qp.xrcdn = xrcdn;
-
-		break;
-
-	case IB_QPT_GSI:
-		return mlx5_ib_gsi_create_qp(pd, init_attr);
-
-	case IB_QPT_RAW_IPV6:
-	case IB_QPT_RAW_ETHERTYPE:
-	case IB_QPT_MAX:
+	switch (flag) {
+	case MLX5_QP_FLAG_SCATTER_CQE:
+	case MLX5_QP_FLAG_ALLOW_SCATTER_CQE:
+		/*
+		 * We don't return an error if these flags were provided
+		 * and mlx5 doesn't have the right capability.
+		 */
+		*flags &= ~(MLX5_QP_FLAG_SCATTER_CQE |
+			    MLX5_QP_FLAG_ALLOW_SCATTER_CQE);
+		return;
 	default:
-		mlx5_ib_dbg(dev, "unsupported qp type %d\n",
-			    init_attr->qp_type);
-		/* Don't support raw QPs */
-		return ERR_PTR(-EINVAL);
+		break;
+	}
+	mlx5_ib_dbg(dev, "Vendor create QP flag 0x%X is not supported\n", flag);
+}
+
+static int process_vendor_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+				void *ucmd, struct ib_qp_init_attr *attr)
+{
+	struct mlx5_core_dev *mdev = dev->mdev;
+	bool cond;
+	int flags;
+
+	if (attr->rwq_ind_tbl)
+		flags = ((struct mlx5_ib_create_qp_rss *)ucmd)->flags;
+	else
+		flags = ((struct mlx5_ib_create_qp *)ucmd)->flags;
+
+	switch (flags & (MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI)) {
+	case MLX5_QP_FLAG_TYPE_DCI:
+		qp->type = MLX5_IB_QPT_DCI;
+		break;
+	case MLX5_QP_FLAG_TYPE_DCT:
+		qp->type = MLX5_IB_QPT_DCT;
+		break;
+	default:
+		if (qp->type != IB_QPT_DRIVER)
+			break;
+		/*
+		 * It is IB_QPT_DRIVER and either no subtype or a
+		 * wrong subtype was provided.
+		 */
+		return -EINVAL;
 	}
 
-	if (verbs_init_attr->qp_type == IB_QPT_DRIVER)
-		qp->qp_sub_type = init_attr->qp_type;
+	process_vendor_flag(dev, &flags, MLX5_QP_FLAG_TYPE_DCI, true, qp);
+	process_vendor_flag(dev, &flags, MLX5_QP_FLAG_TYPE_DCT, true, qp);
 
-	return &qp->ibqp;
+	process_vendor_flag(dev, &flags, MLX5_QP_FLAG_SIGNATURE, true, qp);
+	process_vendor_flag(dev, &flags, MLX5_QP_FLAG_SCATTER_CQE,
+			    MLX5_CAP_GEN(mdev, sctr_data_cqe), qp);
+	process_vendor_flag(dev, &flags, MLX5_QP_FLAG_ALLOW_SCATTER_CQE,
+			    MLX5_CAP_GEN(mdev, sctr_data_cqe), qp);
+
+	if (qp->type == IB_QPT_RAW_PACKET) {
+		cond = MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan) ||
+		       MLX5_CAP_ETH(mdev, tunnel_stateless_gre) ||
+		       MLX5_CAP_ETH(mdev, tunnel_stateless_geneve_rx);
+		process_vendor_flag(dev, &flags, MLX5_QP_FLAG_TUNNEL_OFFLOADS,
+				    cond, qp);
+		process_vendor_flag(dev, &flags,
+				    MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC, true,
+				    qp);
+		process_vendor_flag(dev, &flags,
+				    MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC, true,
+				    qp);
+	}
+
+	if (qp->type == IB_QPT_RC)
+		process_vendor_flag(dev, &flags,
+				    MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE,
+				    MLX5_CAP_GEN(mdev, qp_packet_based), qp);
+
+	process_vendor_flag(dev, &flags, MLX5_QP_FLAG_BFREG_INDEX, true, qp);
+	process_vendor_flag(dev, &flags, MLX5_QP_FLAG_UAR_PAGE_INDEX, true, qp);
+
+	cond = qp->flags_en & ~(MLX5_QP_FLAG_TUNNEL_OFFLOADS |
+				MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
+				MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC);
+	if (attr->rwq_ind_tbl && cond) {
+		mlx5_ib_dbg(dev, "RSS RAW QP has unsupported flags 0x%X\n",
+			    cond);
+		return -EINVAL;
+	}
+
+	if (flags)
+		mlx5_ib_dbg(dev, "udata has unsupported flags 0x%X\n", flags);
+
+	return (flags) ? -EINVAL : 0;
+}
+
+static void process_create_flag(struct mlx5_ib_dev *dev, int *flags, int flag,
+				bool cond, struct mlx5_ib_qp *qp)
+{
+	if (!(*flags & flag))
+		return;
+
+	if (cond) {
+		qp->flags |= flag;
+		*flags &= ~flag;
+		return;
+	}
+
+	if (flag == MLX5_IB_QP_CREATE_WC_TEST) {
+		/*
+		 * Special case: if the condition isn't met, it isn't an error,
+		 * just a different in-kernel flow.
+		 */
+		*flags &= ~MLX5_IB_QP_CREATE_WC_TEST;
+		return;
+	}
+	mlx5_ib_dbg(dev, "Verbs create QP flag 0x%X is not supported\n", flag);
+}
+
+static int process_create_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+				struct ib_qp_init_attr *attr)
+{
+	enum ib_qp_type qp_type = qp->type;
+	struct mlx5_core_dev *mdev = dev->mdev;
+	int create_flags = attr->create_flags;
+	bool cond;
+
+	if (qp_type == MLX5_IB_QPT_DCT)
+		return (create_flags) ? -EINVAL : 0;
+
+	if (qp_type == IB_QPT_RAW_PACKET && attr->rwq_ind_tbl)
+		return (create_flags) ? -EINVAL : 0;
+
+	process_create_flag(dev, &create_flags, IB_QP_CREATE_NETIF_QP,
+			    mlx5_get_flow_namespace(dev->mdev,
+						    MLX5_FLOW_NAMESPACE_BYPASS),
+			    qp);
+	process_create_flag(dev, &create_flags,
+			    IB_QP_CREATE_INTEGRITY_EN,
+			    MLX5_CAP_GEN(mdev, sho), qp);
+	process_create_flag(dev, &create_flags,
+			    IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
+			    MLX5_CAP_GEN(mdev, block_lb_mc), qp);
+	process_create_flag(dev, &create_flags, IB_QP_CREATE_CROSS_CHANNEL,
+			    MLX5_CAP_GEN(mdev, cd), qp);
+	process_create_flag(dev, &create_flags, IB_QP_CREATE_MANAGED_SEND,
+			    MLX5_CAP_GEN(mdev, cd), qp);
+	process_create_flag(dev, &create_flags, IB_QP_CREATE_MANAGED_RECV,
+			    MLX5_CAP_GEN(mdev, cd), qp);
+
+	if (qp_type == IB_QPT_UD) {
+		process_create_flag(dev, &create_flags,
+				    IB_QP_CREATE_IPOIB_UD_LSO,
+				    MLX5_CAP_GEN(mdev, ipoib_basic_offloads),
+				    qp);
+		cond = MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_IB;
+		process_create_flag(dev, &create_flags, IB_QP_CREATE_SOURCE_QPN,
+				    cond, qp);
+	}
+
+	if (qp_type == IB_QPT_RAW_PACKET) {
+		cond = MLX5_CAP_GEN(mdev, eth_net_offloads) &&
+		       MLX5_CAP_ETH(mdev, scatter_fcs);
+		process_create_flag(dev, &create_flags,
+				    IB_QP_CREATE_SCATTER_FCS, cond, qp);
+
+		cond = MLX5_CAP_GEN(mdev, eth_net_offloads) &&
+		       MLX5_CAP_ETH(mdev, vlan_cap);
+		process_create_flag(dev, &create_flags,
+				    IB_QP_CREATE_CVLAN_STRIPPING, cond, qp);
+	}
+
+	process_create_flag(dev, &create_flags,
+			    IB_QP_CREATE_PCI_WRITE_END_PADDING,
+			    MLX5_CAP_GEN(mdev, end_pad), qp);
+
+	process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_WC_TEST,
+			    qp_type != MLX5_IB_QPT_REG_UMR, qp);
+	process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_SQPN_QP1,
+			    true, qp);
+
+	if (create_flags)
+		mlx5_ib_dbg(dev, "Create QP has unsupported flags 0x%X\n",
+			    create_flags);
+
+	return (create_flags) ? -EINVAL : 0;
+}
+
+static int process_udata_size(struct mlx5_ib_dev *dev,
+			      struct mlx5_create_qp_params *params)
+{
+	size_t ucmd = sizeof(struct mlx5_ib_create_qp);
+	struct ib_udata *udata = params->udata;
+	size_t outlen = udata->outlen;
+	size_t inlen = udata->inlen;
+
+	params->outlen = min(outlen, sizeof(struct mlx5_ib_create_qp_resp));
+	params->ucmd_size = ucmd;
+	if (!params->is_rss_raw) {
+		/* User has old rdma-core, which doesn't support ECE */
+		size_t min_inlen =
+			offsetof(struct mlx5_ib_create_qp, ece_options);
+
+		/*
+		 * We will check in check_ucmd_data() that user
+		 * cleared everything after inlen.
+		 */
+		params->inlen = (inlen < min_inlen) ? 0 : min(inlen, ucmd);
+		goto out;
+	}
+
+	/* RSS RAW QP */
+	if (inlen < offsetofend(struct mlx5_ib_create_qp_rss, flags))
+		return -EINVAL;
+
+	if (outlen < offsetofend(struct mlx5_ib_create_qp_resp, bfreg_index))
+		return -EINVAL;
+
+	ucmd = sizeof(struct mlx5_ib_create_qp_rss);
+	params->ucmd_size = ucmd;
+	if (inlen > ucmd && !ib_is_udata_cleared(udata, ucmd, inlen - ucmd))
+		return -EINVAL;
+
+	params->inlen = min(ucmd, inlen);
+out:
+	if (!params->inlen)
+		mlx5_ib_dbg(dev, "udata is too small\n");
+
+	return (params->inlen) ? 0 : -EINVAL;
+}
+
+static int create_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+		     struct mlx5_ib_qp *qp,
+		     struct mlx5_create_qp_params *params)
+{
+	int err;
+
+	if (params->is_rss_raw) {
+		err = create_rss_raw_qp_tir(dev, pd, qp, params);
+		goto out;
+	}
+
+	switch (qp->type) {
+	case MLX5_IB_QPT_DCT:
+		err = create_dct(dev, pd, qp, params);
+		break;
+	case IB_QPT_XRC_TGT:
+		err = create_xrc_tgt_qp(dev, qp, params);
+		break;
+	case IB_QPT_GSI:
+		err = mlx5_ib_create_gsi(pd, qp, params->attr);
+		break;
+	default:
+		if (params->udata)
+			err = create_user_qp(dev, pd, qp, params);
+		else
+			err = create_kernel_qp(dev, pd, qp, params);
+	}
+
+out:
+	if (err) {
+		mlx5_ib_err(dev, "Create QP type %d failed\n", qp->type);
+		return err;
+	}
+
+	if (is_qp0(qp->type))
+		qp->ibqp.qp_num = 0;
+	else if (is_qp1(qp->type))
+		qp->ibqp.qp_num = 1;
+	else
+		qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn;
+
+	mlx5_ib_dbg(dev,
+		"QP type %d, ib qpn 0x%X, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x, ece 0x%x\n",
+		qp->type, qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn,
+		params->attr->recv_cq ? to_mcq(params->attr->recv_cq)->mcq.cqn :
+					-1,
+		params->attr->send_cq ? to_mcq(params->attr->send_cq)->mcq.cqn :
+					-1,
+		params->resp.ece_options);
+
+	return 0;
+}
+
+static int check_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+			 struct ib_qp_init_attr *attr)
+{
+	int ret = 0;
+
+	switch (qp->type) {
+	case MLX5_IB_QPT_DCT:
+		ret = (!attr->srq || !attr->recv_cq) ? -EINVAL : 0;
+		break;
+	case MLX5_IB_QPT_DCI:
+		ret = (attr->cap.max_recv_wr || attr->cap.max_recv_sge) ?
+			      -EINVAL :
+			      0;
+		break;
+	case IB_QPT_RAW_PACKET:
+		ret = (attr->rwq_ind_tbl && attr->send_cq) ? -EINVAL : 0;
+		break;
+	default:
+		break;
+	}
+
+	if (ret)
+		mlx5_ib_dbg(dev, "QP type %d has wrong attributes\n", qp->type);
+
+	return ret;
+}
+
+static int get_qp_uidx(struct mlx5_ib_qp *qp,
+		       struct mlx5_create_qp_params *params)
+{
+	struct mlx5_ib_create_qp *ucmd = params->ucmd;
+	struct ib_udata *udata = params->udata;
+	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
+
+	if (params->is_rss_raw)
+		return 0;
+
+	return get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), &params->uidx);
 }
 
 static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp)
@@ -2757,7 +2854,7 @@
 	if (mqp->state == IB_QPS_RTR) {
 		int err;
 
-		err = mlx5_core_destroy_dct(dev->mdev, &mqp->dct.mdct);
+		err = mlx5_core_destroy_dct(dev, &mqp->dct.mdct);
 		if (err) {
 			mlx5_ib_warn(dev, "failed to destroy DCT %d\n", err);
 			return err;
@@ -2769,15 +2866,163 @@
 	return 0;
 }
 
+static int check_ucmd_data(struct mlx5_ib_dev *dev,
+			   struct mlx5_create_qp_params *params)
+{
+	struct ib_udata *udata = params->udata;
+	size_t size, last;
+	int ret;
+
+	if (params->is_rss_raw)
+		/*
+		 * These QPs don't have a "reserved" field in their
+		 * create_qp input struct, so their data is always valid.
+		 */
+		last = sizeof(struct mlx5_ib_create_qp_rss);
+	else
+		last = offsetof(struct mlx5_ib_create_qp, reserved);
+
+	if (udata->inlen <= last)
+		return 0;
+
+	/*
+	 * The user provides different create_qp structures based on the
+	 * flow, and we need to know whether the memory after our
+	 * struct create_qp ends was cleared.
+	 */
+	size = udata->inlen - last;
+	ret = ib_is_udata_cleared(params->udata, last, size);
+	if (!ret)
+		mlx5_ib_dbg(
+			dev,
+			"udata is not cleared, inlen = %zu, ucmd = %zu, last = %zu, size = %zu\n",
+			udata->inlen, params->ucmd_size, last, size);
+	return ret ? 0 : -EINVAL;
+}
+
+struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attr,
+				struct ib_udata *udata)
+{
+	struct mlx5_create_qp_params params = {};
+	struct mlx5_ib_dev *dev;
+	struct mlx5_ib_qp *qp;
+	enum ib_qp_type type;
+	int err;
+
+	dev = pd ? to_mdev(pd->device) :
+		   to_mdev(to_mxrcd(attr->xrcd)->ibxrcd.device);
+
+	err = check_qp_type(dev, attr, &type);
+	if (err)
+		return ERR_PTR(err);
+
+	err = check_valid_flow(dev, pd, attr, udata);
+	if (err)
+		return ERR_PTR(err);
+
+	params.udata = udata;
+	params.uidx = MLX5_IB_DEFAULT_UIDX;
+	params.attr = attr;
+	params.is_rss_raw = !!attr->rwq_ind_tbl;
+
+	if (udata) {
+		err = process_udata_size(dev, &params);
+		if (err)
+			return ERR_PTR(err);
+
+		err = check_ucmd_data(dev, &params);
+		if (err)
+			return ERR_PTR(err);
+
+		params.ucmd = kzalloc(params.ucmd_size, GFP_KERNEL);
+		if (!params.ucmd)
+			return ERR_PTR(-ENOMEM);
+
+		err = ib_copy_from_udata(params.ucmd, udata, params.inlen);
+		if (err)
+			goto free_ucmd;
+	}
+
+	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+	if (!qp) {
+		err = -ENOMEM;
+		goto free_ucmd;
+	}
+
+	mutex_init(&qp->mutex);
+	qp->type = type;
+	if (udata) {
+		err = process_vendor_flags(dev, qp, params.ucmd, attr);
+		if (err)
+			goto free_qp;
+
+		err = get_qp_uidx(qp, &params);
+		if (err)
+			goto free_qp;
+	}
+	err = process_create_flags(dev, qp, attr);
+	if (err)
+		goto free_qp;
+
+	err = check_qp_attr(dev, qp, attr);
+	if (err)
+		goto free_qp;
+
+	err = create_qp(dev, pd, qp, &params);
+	if (err)
+		goto free_qp;
+
+	kfree(params.ucmd);
+	params.ucmd = NULL;
+
+	if (udata)
+		/*
+		 * It is safe to copy response for all user create QP flows,
+		 * including MLX5_IB_QPT_DCT, which doesn't need it.
+		 * In that case, resp will be filled with zeros.
+		 */
+		err = ib_copy_to_udata(udata, &params.resp, params.outlen);
+	if (err)
+		goto destroy_qp;
+
+	return &qp->ibqp;
+
+destroy_qp:
+	switch (qp->type) {
+	case MLX5_IB_QPT_DCT:
+		mlx5_ib_destroy_dct(qp);
+		break;
+	case IB_QPT_GSI:
+		mlx5_ib_destroy_gsi(qp);
+		break;
+	default:
+		/*
+		 * The lines below are a temporary solution until QP allocation
+		 * is moved under IB/core responsibility.
+		 */
+		qp->ibqp.send_cq = attr->send_cq;
+		qp->ibqp.recv_cq = attr->recv_cq;
+		qp->ibqp.pd = pd;
+		destroy_qp_common(dev, qp, udata);
+	}
+
+	qp = NULL;
+free_qp:
+	kfree(qp);
+free_ucmd:
+	kfree(params.ucmd);
+	return ERR_PTR(err);
+}
+
 int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(qp->device);
 	struct mlx5_ib_qp *mqp = to_mqp(qp);
 
 	if (unlikely(qp->qp_type == IB_QPT_GSI))
-		return mlx5_ib_gsi_destroy_qp(qp);
+		return mlx5_ib_destroy_gsi(mqp);
 
-	if (mqp->qp_sub_type == MLX5_IB_QPT_DCT)
+	if (mqp->type == MLX5_IB_QPT_DCT)
 		return mlx5_ib_destroy_dct(mqp);
 
 	destroy_qp_common(dev, mqp, udata);
@@ -2787,14 +3032,13 @@
 	return 0;
 }
 
-static int to_mlx5_access_flags(struct mlx5_ib_qp *qp,
-				const struct ib_qp_attr *attr,
-				int attr_mask, __be32 *hw_access_flags_be)
+static int set_qpc_atomic_flags(struct mlx5_ib_qp *qp,
+				const struct ib_qp_attr *attr, int attr_mask,
+				void *qpc)
 {
-	u8 dest_rd_atomic;
-	u32 access_flags, hw_access_flags = 0;
-
 	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
+	u8 dest_rd_atomic;
+	u32 access_flags;
 
 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
 		dest_rd_atomic = attr->max_dest_rd_atomic;
@@ -2809,8 +3053,8 @@
 	if (!dest_rd_atomic)
 		access_flags &= IB_ACCESS_REMOTE_WRITE;
 
-	if (access_flags & IB_ACCESS_REMOTE_READ)
-		hw_access_flags |= MLX5_QP_BIT_RRE;
+	MLX5_SET(qpc, qpc, rre, !!(access_flags & IB_ACCESS_REMOTE_READ));
+
 	if (access_flags & IB_ACCESS_REMOTE_ATOMIC) {
 		int atomic_mode;
 
@@ -2818,15 +3062,11 @@
 		if (atomic_mode < 0)
 			return -EOPNOTSUPP;
 
-		hw_access_flags |= MLX5_QP_BIT_RAE;
-		hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFFSET;
+		MLX5_SET(qpc, qpc, rae, 1);
+		MLX5_SET(qpc, qpc, atomic_mode, atomic_mode);
 	}
 
-	if (access_flags & IB_ACCESS_REMOTE_WRITE)
-		hw_access_flags |= MLX5_QP_BIT_RWE;
-
-	*hw_access_flags_be = cpu_to_be32(hw_access_flags);
-
+	MLX5_SET(qpc, qpc, rwe, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
 	return 0;
 }
 
@@ -2836,20 +3076,57 @@
 	MLX5_PATH_FLAG_COUNTER	= 1 << 2,
 };
 
+static int mlx5_to_ib_rate_map(u8 rate)
+{
+	static const int rates[] = { IB_RATE_PORT_CURRENT, IB_RATE_56_GBPS,
+				     IB_RATE_25_GBPS,	   IB_RATE_100_GBPS,
+				     IB_RATE_200_GBPS,	   IB_RATE_50_GBPS,
+				     IB_RATE_400_GBPS };
+
+	if (rate < ARRAY_SIZE(rates))
+		return rates[rate];
+
+	return rate - MLX5_STAT_RATE_OFFSET;
+}
+
+static int ib_to_mlx5_rate_map(u8 rate)
+{
+	switch (rate) {
+	case IB_RATE_PORT_CURRENT:
+		return 0;
+	case IB_RATE_56_GBPS:
+		return 1;
+	case IB_RATE_25_GBPS:
+		return 2;
+	case IB_RATE_100_GBPS:
+		return 3;
+	case IB_RATE_200_GBPS:
+		return 4;
+	case IB_RATE_50_GBPS:
+		return 5;
+	default:
+		return rate + MLX5_STAT_RATE_OFFSET;
+	}
+
+	return 0;
+}
+
 static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
 {
+	u32 stat_rate_support;
+
 	if (rate == IB_RATE_PORT_CURRENT)
 		return 0;
 
 	if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_600_GBPS)
 		return -EINVAL;
 
+	stat_rate_support = MLX5_CAP_GEN(dev->mdev, stat_rate_support);
 	while (rate != IB_RATE_PORT_CURRENT &&
-	       !(1 << (rate + MLX5_STAT_RATE_OFFSET) &
-		 MLX5_CAP_GEN(dev->mdev, stat_rate_support)))
+	       !(1 << ib_to_mlx5_rate_map(rate) & stat_rate_support))
 		--rate;
 
-	return rate ? rate + MLX5_STAT_RATE_OFFSET : rate;
+	return ib_to_mlx5_rate_map(rate);
 }
 
 static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev,
@@ -2872,7 +3149,7 @@
 	tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx);
 	MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1));
 
-	err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen);
+	err = mlx5_core_modify_tis(dev, sq->tisn, in);
 
 	kvfree(in);
 
@@ -2899,18 +3176,29 @@
 	tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx);
 	MLX5_SET(tisc, tisc, lag_tx_port_affinity, tx_affinity);
 
-	err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen);
+	err = mlx5_core_modify_tis(dev, sq->tisn, in);
 
 	kvfree(in);
 
 	return err;
 }
 
+static void mlx5_set_path_udp_sport(void *path, const struct rdma_ah_attr *ah,
+				    u32 lqpn, u32 rqpn)
+
+{
+	u32 fl = ah->grh.flow_label;
+
+	if (!fl)
+		fl = rdma_calc_flow_label(lqpn, rqpn);
+
+	MLX5_SET(ads, path, udp_sport, rdma_flow_label_to_udp_sport(fl));
+}
+
 static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
-			 const struct rdma_ah_attr *ah,
-			 struct mlx5_qp_path *path, u8 port, int attr_mask,
-			 u32 path_flags, const struct ib_qp_attr *attr,
-			 bool alt)
+			 const struct rdma_ah_attr *ah, void *path, u8 port,
+			 int attr_mask, u32 path_flags,
+			 const struct ib_qp_attr *attr, bool alt)
 {
 	const struct ib_global_route *grh = rdma_ah_read_grh(ah);
 	int err;
@@ -2919,8 +3207,8 @@
 	u8 sl = rdma_ah_get_sl(ah);
 
 	if (attr_mask & IB_QP_PKEY_INDEX)
-		path->pkey_index = cpu_to_be16(alt ? attr->alt_pkey_index :
-						     attr->pkey_index);
+		MLX5_SET(ads, path, pkey_index,
+			 alt ? attr->alt_pkey_index : attr->pkey_index);
 
 	if (ah_flags & IB_AH_GRH) {
 		if (grh->sgid_index >=
@@ -2936,45 +3224,49 @@
 		if (!(ah_flags & IB_AH_GRH))
 			return -EINVAL;
 
-		memcpy(path->rmac, ah->roce.dmac, sizeof(ah->roce.dmac));
-		if (qp->ibqp.qp_type == IB_QPT_RC ||
-		    qp->ibqp.qp_type == IB_QPT_UC ||
-		    qp->ibqp.qp_type == IB_QPT_XRC_INI ||
-		    qp->ibqp.qp_type == IB_QPT_XRC_TGT)
-			path->udp_sport =
-				mlx5_get_roce_udp_sport(dev, ah->grh.sgid_attr);
-		path->dci_cfi_prio_sl = (sl & 0x7) << 4;
+		ether_addr_copy(MLX5_ADDR_OF(ads, path, rmac_47_32),
+				ah->roce.dmac);
+		if ((qp->ibqp.qp_type == IB_QPT_RC ||
+		     qp->ibqp.qp_type == IB_QPT_UC ||
+		     qp->ibqp.qp_type == IB_QPT_XRC_INI ||
+		     qp->ibqp.qp_type == IB_QPT_XRC_TGT) &&
+		    (grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) &&
+		    (attr_mask & IB_QP_DEST_QPN))
+			mlx5_set_path_udp_sport(path, ah,
+						qp->ibqp.qp_num,
+						attr->dest_qp_num);
+		MLX5_SET(ads, path, eth_prio, sl & 0x7);
 		gid_type = ah->grh.sgid_attr->gid_type;
 		if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
-			path->ecn_dscp = (grh->traffic_class >> 2) & 0x3f;
+			MLX5_SET(ads, path, dscp, grh->traffic_class >> 2);
 	} else {
-		path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
-		path->fl_free_ar |=
-			(path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0;
-		path->rlid = cpu_to_be16(rdma_ah_get_dlid(ah));
-		path->grh_mlid = rdma_ah_get_path_bits(ah) & 0x7f;
-		if (ah_flags & IB_AH_GRH)
-			path->grh_mlid	|= 1 << 7;
-		path->dci_cfi_prio_sl = sl & 0xf;
+		MLX5_SET(ads, path, fl, !!(path_flags & MLX5_PATH_FLAG_FL));
+		MLX5_SET(ads, path, free_ar,
+			 !!(path_flags & MLX5_PATH_FLAG_FREE_AR));
+		MLX5_SET(ads, path, rlid, rdma_ah_get_dlid(ah));
+		MLX5_SET(ads, path, mlid, rdma_ah_get_path_bits(ah));
+		MLX5_SET(ads, path, grh, !!(ah_flags & IB_AH_GRH));
+		MLX5_SET(ads, path, sl, sl);
 	}
 
 	if (ah_flags & IB_AH_GRH) {
-		path->mgid_index = grh->sgid_index;
-		path->hop_limit  = grh->hop_limit;
-		path->tclass_flowlabel =
-			cpu_to_be32((grh->traffic_class << 20) |
-				    (grh->flow_label));
-		memcpy(path->rgid, grh->dgid.raw, 16);
+		MLX5_SET(ads, path, src_addr_index, grh->sgid_index);
+		MLX5_SET(ads, path, hop_limit, grh->hop_limit);
+		MLX5_SET(ads, path, tclass, grh->traffic_class);
+		MLX5_SET(ads, path, flow_label, grh->flow_label);
+		memcpy(MLX5_ADDR_OF(ads, path, rgid_rip), grh->dgid.raw,
+		       sizeof(grh->dgid.raw));
 	}
 
 	err = ib_rate_to_mlx5(dev, rdma_ah_get_static_rate(ah));
 	if (err < 0)
 		return err;
-	path->static_rate = err;
-	path->port = port;
+	MLX5_SET(ads, path, stat_rate, err);
+	MLX5_SET(ads, path, vhca_port_num, port);
 
 	if (attr_mask & IB_QP_TIMEOUT)
-		path->ackto_lt = (alt ? attr->alt_timeout : attr->timeout) << 3;
+		MLX5_SET(ads, path, ack_timeout,
+			 alt ? attr->alt_timeout : attr->timeout);
 
 	if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt)
 		return modify_raw_packet_eth_prio(dev->mdev,
@@ -2991,10 +3283,12 @@
 					  MLX5_QP_OPTPAR_RAE		|
 					  MLX5_QP_OPTPAR_RWE		|
 					  MLX5_QP_OPTPAR_PKEY_INDEX	|
-					  MLX5_QP_OPTPAR_PRI_PORT,
+					  MLX5_QP_OPTPAR_PRI_PORT	|
+					  MLX5_QP_OPTPAR_LAG_TX_AFF,
 			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE		|
 					  MLX5_QP_OPTPAR_PKEY_INDEX	|
-					  MLX5_QP_OPTPAR_PRI_PORT,
+					  MLX5_QP_OPTPAR_PRI_PORT	|
+					  MLX5_QP_OPTPAR_LAG_TX_AFF,
 			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX	|
 					  MLX5_QP_OPTPAR_Q_KEY		|
 					  MLX5_QP_OPTPAR_PRI_PORT,
@@ -3002,17 +3296,20 @@
 					  MLX5_QP_OPTPAR_RAE		|
 					  MLX5_QP_OPTPAR_RWE		|
 					  MLX5_QP_OPTPAR_PKEY_INDEX	|
-					  MLX5_QP_OPTPAR_PRI_PORT,
+					  MLX5_QP_OPTPAR_PRI_PORT	|
+					  MLX5_QP_OPTPAR_LAG_TX_AFF,
 		},
 		[MLX5_QP_STATE_RTR] = {
 			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
 					  MLX5_QP_OPTPAR_RRE            |
 					  MLX5_QP_OPTPAR_RAE            |
 					  MLX5_QP_OPTPAR_RWE            |
-					  MLX5_QP_OPTPAR_PKEY_INDEX,
+					  MLX5_QP_OPTPAR_PKEY_INDEX	|
+					  MLX5_QP_OPTPAR_LAG_TX_AFF,
 			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
 					  MLX5_QP_OPTPAR_RWE            |
-					  MLX5_QP_OPTPAR_PKEY_INDEX,
+					  MLX5_QP_OPTPAR_PKEY_INDEX	|
+					  MLX5_QP_OPTPAR_LAG_TX_AFF,
 			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX     |
 					  MLX5_QP_OPTPAR_Q_KEY,
 			[MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX	|
@@ -3021,7 +3318,8 @@
 					  MLX5_QP_OPTPAR_RRE            |
 					  MLX5_QP_OPTPAR_RAE            |
 					  MLX5_QP_OPTPAR_RWE            |
-					  MLX5_QP_OPTPAR_PKEY_INDEX,
+					  MLX5_QP_OPTPAR_PKEY_INDEX	|
+					  MLX5_QP_OPTPAR_LAG_TX_AFF,
 		},
 	},
 	[MLX5_QP_STATE_RTR] = {
@@ -3179,7 +3477,7 @@
 				"RAW PACKET QP counters are not supported on current FW\n");
 	}
 
-	err = mlx5_core_modify_rq(dev->mdev, rq->base.mqp.qpn, in, inlen);
+	err = mlx5_core_modify_rq(dev->mdev, rq->base.mqp.qpn, in);
 	if (err)
 		goto out;
 
@@ -3242,7 +3540,7 @@
 		MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);
 	}
 
-	err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in, inlen);
+	err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in);
 	if (err) {
 		/* Remove new rate from table if failed */
 		if (new_rate_added)
@@ -3282,7 +3580,7 @@
 	switch (raw_qp_param->operation) {
 	case MLX5_CMD_OP_RST2INIT_QP:
 		rq_state = MLX5_RQC_STATE_RDY;
-		sq_state = MLX5_SQC_STATE_RDY;
+		sq_state = MLX5_SQC_STATE_RST;
 		break;
 	case MLX5_CMD_OP_2ERR_QP:
 		rq_state = MLX5_RQC_STATE_ERR;
@@ -3294,13 +3592,11 @@
 		break;
 	case MLX5_CMD_OP_RTR2RTS_QP:
 	case MLX5_CMD_OP_RTS2RTS_QP:
-		if (raw_qp_param->set_mask ==
-		    MLX5_RAW_QP_RATE_LIMIT) {
-			modify_rq = 0;
-			sq_state = sq->state;
-		} else {
-			return raw_qp_param->set_mask ? -EINVAL : 0;
-		}
+		if (raw_qp_param->set_mask & ~MLX5_RAW_QP_RATE_LIMIT)
+			return -EINVAL;
+
+		modify_rq = 0;
+		sq_state = MLX5_SQC_STATE_RDY;
 		break;
 	case MLX5_CMD_OP_INIT2INIT_QP:
 	case MLX5_CMD_OP_INIT2RTR_QP:
@@ -3355,43 +3651,78 @@
 	return 0;
 }
 
-static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
-				    struct mlx5_ib_pd *pd,
-				    struct mlx5_ib_qp_base *qp_base,
-				    u8 port_num, struct ib_udata *udata)
+static unsigned int get_tx_affinity_rr(struct mlx5_ib_dev *dev,
+				       struct ib_udata *udata)
 {
 	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
 		udata, struct mlx5_ib_ucontext, ibucontext);
-	unsigned int tx_port_affinity;
+	u8 port_num = mlx5_core_native_port_num(dev->mdev) - 1;
+	atomic_t *tx_port_affinity;
 
-	if (ucontext) {
-		tx_port_affinity = (unsigned int)atomic_add_return(
-					   1, &ucontext->tx_port_affinity) %
-					   MLX5_MAX_PORTS +
-				   1;
+	if (ucontext)
+		tx_port_affinity = &ucontext->tx_port_affinity;
+	else
+		tx_port_affinity = &dev->port[port_num].roce.tx_port_affinity;
+
+	return (unsigned int)atomic_add_return(1, tx_port_affinity) %
+		MLX5_MAX_PORTS + 1;
+}
+
+static bool qp_supports_affinity(struct mlx5_ib_qp *qp)
+{
+	if ((qp->type == IB_QPT_RC) || (qp->type == IB_QPT_UD) ||
+	    (qp->type == IB_QPT_UC) || (qp->type == IB_QPT_RAW_PACKET) ||
+	    (qp->type == IB_QPT_XRC_INI) || (qp->type == IB_QPT_XRC_TGT) ||
+	    (qp->type == MLX5_IB_QPT_DCI))
+		return true;
+	return false;
+}
+
+static unsigned int get_tx_affinity(struct ib_qp *qp,
+				    const struct ib_qp_attr *attr,
+				    int attr_mask, u8 init,
+				    struct ib_udata *udata)
+{
+	struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+		udata, struct mlx5_ib_ucontext, ibucontext);
+	struct mlx5_ib_dev *dev = to_mdev(qp->device);
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+	struct mlx5_ib_qp_base *qp_base;
+	unsigned int tx_affinity;
+
+	if (!(mlx5_ib_lag_should_assign_affinity(dev) &&
+	      qp_supports_affinity(mqp)))
+		return 0;
+
+	if (mqp->flags & MLX5_IB_QP_CREATE_SQPN_QP1)
+		tx_affinity = mqp->gsi_lag_port;
+	else if (init)
+		tx_affinity = get_tx_affinity_rr(dev, udata);
+	else if ((attr_mask & IB_QP_AV) && attr->xmit_slave)
+		tx_affinity =
+			mlx5_lag_get_slave_port(dev->mdev, attr->xmit_slave);
+	else
+		return 0;
+
+	qp_base = &mqp->trans_qp.base;
+	if (ucontext)
 		mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x ucontext %p\n",
-				tx_port_affinity, qp_base->mqp.qpn, ucontext);
-	} else {
-		tx_port_affinity =
-			(unsigned int)atomic_add_return(
-				1, &dev->port[port_num].roce.tx_port_affinity) %
-				MLX5_MAX_PORTS +
-			1;
+			    tx_affinity, qp_base->mqp.qpn, ucontext);
+	else
 		mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x\n",
-				tx_port_affinity, qp_base->mqp.qpn);
-	}
-
-	return tx_port_affinity;
+			    tx_affinity, qp_base->mqp.qpn);
+	return tx_affinity;
 }
 
 static int __mlx5_ib_qp_set_counter(struct ib_qp *qp,
 				    struct rdma_counter *counter)
 {
 	struct mlx5_ib_dev *dev = to_mdev(qp->device);
+	u32 in[MLX5_ST_SZ_DW(rts2rts_qp_in)] = {};
 	struct mlx5_ib_qp *mqp = to_mqp(qp);
-	struct mlx5_qp_context context = {};
 	struct mlx5_ib_qp_base *base;
 	u32 set_id;
+	u32 *qpc;
 
 	if (counter)
 		set_id = counter->id;
@@ -3399,12 +3730,15 @@
 		set_id = mlx5_ib_get_counters_id(dev, mqp->port - 1);
 
 	base = &mqp->trans_qp.base;
-	context.qp_counter_set_usr_page &= cpu_to_be32(0xffffff);
-	context.qp_counter_set_usr_page |= cpu_to_be32(set_id << 24);
-	return mlx5_core_qp_modify(dev->mdev,
-				   MLX5_CMD_OP_RTS2RTS_QP,
-				   MLX5_QP_OPTPAR_COUNTER_SET_ID,
-				   &context, &base->mqp);
+	MLX5_SET(rts2rts_qp_in, in, opcode, MLX5_CMD_OP_RTS2RTS_QP);
+	MLX5_SET(rts2rts_qp_in, in, qpn, base->mqp.qpn);
+	MLX5_SET(rts2rts_qp_in, in, uid, base->mqp.uid);
+	MLX5_SET(rts2rts_qp_in, in, opt_param_mask,
+		 MLX5_QP_OPTPAR_COUNTER_SET_ID);
+
+	qpc = MLX5_ADDR_OF(rts2rts_qp_in, in, qpc);
+	MLX5_SET(qpc, qpc, counter_set_id, set_id);
+	return mlx5_cmd_exec_in(dev->mdev, rts2rts_qp, in);
 }
 
 static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
@@ -3412,6 +3746,7 @@
 			       enum ib_qp_state cur_state,
 			       enum ib_qp_state new_state,
 			       const struct mlx5_ib_modify_qp *ucmd,
+			       struct mlx5_ib_modify_qp_resp *resp,
 			       struct ib_udata *udata)
 {
 	static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = {
@@ -3455,67 +3790,60 @@
 	struct mlx5_ib_qp *qp = to_mqp(ibqp);
 	struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
 	struct mlx5_ib_cq *send_cq, *recv_cq;
-	struct mlx5_qp_context *context;
 	struct mlx5_ib_pd *pd;
 	enum mlx5_qp_state mlx5_cur, mlx5_new;
-	enum mlx5_qp_optpar optpar;
+	void *qpc, *pri_path, *alt_path;
+	enum mlx5_qp_optpar optpar = 0;
 	u32 set_id = 0;
 	int mlx5_st;
 	int err;
 	u16 op;
 	u8 tx_affinity = 0;
 
-	mlx5_st = to_mlx5_st(ibqp->qp_type == IB_QPT_DRIVER ?
-			     qp->qp_sub_type : ibqp->qp_type);
+	mlx5_st = to_mlx5_st(qp->type);
 	if (mlx5_st < 0)
 		return -EINVAL;
 
-	context = kzalloc(sizeof(*context), GFP_KERNEL);
-	if (!context)
+	qpc = kzalloc(MLX5_ST_SZ_BYTES(qpc), GFP_KERNEL);
+	if (!qpc)
 		return -ENOMEM;
 
-	pd = get_pd(qp);
-	context->flags = cpu_to_be32(mlx5_st << 16);
+	pd = to_mpd(qp->ibqp.pd);
+	MLX5_SET(qpc, qpc, st, mlx5_st);
 
 	if (!(attr_mask & IB_QP_PATH_MIG_STATE)) {
-		context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11);
+		MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
 	} else {
 		switch (attr->path_mig_state) {
 		case IB_MIG_MIGRATED:
-			context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11);
+			MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
 			break;
 		case IB_MIG_REARM:
-			context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11);
+			MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_REARM);
 			break;
 		case IB_MIG_ARMED:
-			context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11);
+			MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_ARMED);
 			break;
 		}
 	}
 
-	if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) {
-		if ((ibqp->qp_type == IB_QPT_RC) ||
-		    (ibqp->qp_type == IB_QPT_UD &&
-		     !(qp->flags & MLX5_IB_QP_SQPN_QP1)) ||
-		    (ibqp->qp_type == IB_QPT_UC) ||
-		    (ibqp->qp_type == IB_QPT_RAW_PACKET) ||
-		    (ibqp->qp_type == IB_QPT_XRC_INI) ||
-		    (ibqp->qp_type == IB_QPT_XRC_TGT)) {
-			if (dev->lag_active) {
-				u8 p = mlx5_core_native_port_num(dev->mdev) - 1;
-				tx_affinity = get_tx_affinity(dev, pd, base, p,
-							      udata);
-				context->flags |= cpu_to_be32(tx_affinity << 24);
-			}
-		}
-	}
+	tx_affinity = get_tx_affinity(ibqp, attr, attr_mask,
+				      cur_state == IB_QPS_RESET &&
+				      new_state == IB_QPS_INIT, udata);
+
+	MLX5_SET(qpc, qpc, lag_tx_port_affinity, tx_affinity);
+	if (tx_affinity && new_state == IB_QPS_RTR &&
+	    MLX5_CAP_GEN(dev->mdev, init2_lag_tx_port_affinity))
+		optpar |= MLX5_QP_OPTPAR_LAG_TX_AFF;
 
 	if (is_sqp(ibqp->qp_type)) {
-		context->mtu_msgmax = (IB_MTU_256 << 5) | 8;
+		MLX5_SET(qpc, qpc, mtu, IB_MTU_256);
+		MLX5_SET(qpc, qpc, log_msg_max, 8);
 	} else if ((ibqp->qp_type == IB_QPT_UD &&
-		    !(qp->flags & MLX5_IB_QP_UNDERLAY)) ||
+		    !(qp->flags & IB_QP_CREATE_SOURCE_QPN)) ||
 		   ibqp->qp_type == MLX5_IB_QPT_REG_UMR) {
-		context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
+		MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
+		MLX5_SET(qpc, qpc, log_msg_max, 12);
 	} else if (attr_mask & IB_QP_PATH_MTU) {
 		if (attr->path_mtu < IB_MTU_256 ||
 		    attr->path_mtu > IB_MTU_4096) {
@@ -3523,40 +3851,45 @@
 			err = -EINVAL;
 			goto out;
 		}
-		context->mtu_msgmax = (attr->path_mtu << 5) |
-				      (u8)MLX5_CAP_GEN(dev->mdev, log_max_msg);
+		MLX5_SET(qpc, qpc, mtu, attr->path_mtu);
+		MLX5_SET(qpc, qpc, log_msg_max,
+			 MLX5_CAP_GEN(dev->mdev, log_max_msg));
 	}
 
 	if (attr_mask & IB_QP_DEST_QPN)
-		context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num);
+		MLX5_SET(qpc, qpc, remote_qpn, attr->dest_qp_num);
+
+	pri_path = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
+	alt_path = MLX5_ADDR_OF(qpc, qpc, secondary_address_path);
 
 	if (attr_mask & IB_QP_PKEY_INDEX)
-		context->pri_path.pkey_index = cpu_to_be16(attr->pkey_index);
+		MLX5_SET(ads, pri_path, pkey_index, attr->pkey_index);
 
 	/* todo implement counter_index functionality */
 
 	if (is_sqp(ibqp->qp_type))
-		context->pri_path.port = qp->port;
+		MLX5_SET(ads, pri_path, vhca_port_num, qp->port);
 
 	if (attr_mask & IB_QP_PORT)
-		context->pri_path.port = attr->port_num;
+		MLX5_SET(ads, pri_path, vhca_port_num, attr->port_num);
 
 	if (attr_mask & IB_QP_AV) {
-		err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path,
-				    attr_mask & IB_QP_PORT ? attr->port_num : qp->port,
+		err = mlx5_set_path(dev, qp, &attr->ah_attr, pri_path,
+				    attr_mask & IB_QP_PORT ? attr->port_num :
+							     qp->port,
 				    attr_mask, 0, attr, false);
 		if (err)
 			goto out;
 	}
 
 	if (attr_mask & IB_QP_TIMEOUT)
-		context->pri_path.ackto_lt |= attr->timeout << 3;
+		MLX5_SET(ads, pri_path, ack_timeout, attr->timeout);
 
 	if (attr_mask & IB_QP_ALT_PATH) {
-		err = mlx5_set_path(dev, qp, &attr->alt_ah_attr,
-				    &context->alt_path,
+		err = mlx5_set_path(dev, qp, &attr->alt_ah_attr, alt_path,
 				    attr->alt_port_num,
-				    attr_mask | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT,
+				    attr_mask | IB_QP_PKEY_INDEX |
+					    IB_QP_TIMEOUT,
 				    0, attr, true);
 		if (err)
 			goto out;
@@ -3565,75 +3898,68 @@
 	get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
 		&send_cq, &recv_cq);
 
-	context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn);
-	context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0;
-	context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0;
-	context->params1  = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28);
+	MLX5_SET(qpc, qpc, pd, pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn);
+	if (send_cq)
+		MLX5_SET(qpc, qpc, cqn_snd, send_cq->mcq.cqn);
+	if (recv_cq)
+		MLX5_SET(qpc, qpc, cqn_rcv, recv_cq->mcq.cqn);
+
+	MLX5_SET(qpc, qpc, log_ack_req_freq, MLX5_IB_ACK_REQ_FREQ);
 
 	if (attr_mask & IB_QP_RNR_RETRY)
-		context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
+		MLX5_SET(qpc, qpc, rnr_retry, attr->rnr_retry);
 
 	if (attr_mask & IB_QP_RETRY_CNT)
-		context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+		MLX5_SET(qpc, qpc, retry_count, attr->retry_cnt);
 
-	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
-		if (attr->max_rd_atomic)
-			context->params1 |=
-				cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
-	}
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC && attr->max_rd_atomic)
+		MLX5_SET(qpc, qpc, log_sra_max, ilog2(attr->max_rd_atomic));
 
 	if (attr_mask & IB_QP_SQ_PSN)
-		context->next_send_psn = cpu_to_be32(attr->sq_psn);
+		MLX5_SET(qpc, qpc, next_send_psn, attr->sq_psn);
 
-	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
-		if (attr->max_dest_rd_atomic)
-			context->params2 |=
-				cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
-	}
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC && attr->max_dest_rd_atomic)
+		MLX5_SET(qpc, qpc, log_rra_max,
+			 ilog2(attr->max_dest_rd_atomic));
 
 	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
-		__be32 access_flags;
-
-		err = to_mlx5_access_flags(qp, attr, attr_mask, &access_flags);
+		err = set_qpc_atomic_flags(qp, attr, attr_mask, qpc);
 		if (err)
 			goto out;
-
-		context->params2 |= access_flags;
 	}
 
 	if (attr_mask & IB_QP_MIN_RNR_TIMER)
-		context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+		MLX5_SET(qpc, qpc, min_rnr_nak, attr->min_rnr_timer);
 
 	if (attr_mask & IB_QP_RQ_PSN)
-		context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+		MLX5_SET(qpc, qpc, next_rcv_psn, attr->rq_psn);
 
 	if (attr_mask & IB_QP_QKEY)
-		context->qkey = cpu_to_be32(attr->qkey);
+		MLX5_SET(qpc, qpc, q_key, attr->qkey);
 
 	if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
-		context->db_rec_addr = cpu_to_be64(qp->db.dma);
+		MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
 
 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
 		u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num :
 			       qp->port) - 1;
 
 		/* Underlay port should be used - index 0 function per port */
-		if (qp->flags & MLX5_IB_QP_UNDERLAY)
+		if (qp->flags & IB_QP_CREATE_SOURCE_QPN)
 			port_num = 0;
 
 		if (ibqp->counter)
 			set_id = ibqp->counter->id;
 		else
 			set_id = mlx5_ib_get_counters_id(dev, port_num);
-		context->qp_counter_set_usr_page |=
-			cpu_to_be32(set_id << 24);
+		MLX5_SET(qpc, qpc, counter_set_id, set_id);
 	}
 
 	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
-		context->sq_crq_size |= cpu_to_be16(1 << 4);
+		MLX5_SET(qpc, qpc, rlky, 1);
 
-	if (qp->flags & MLX5_IB_QP_SQPN_QP1)
-		context->deth_sqpn = cpu_to_be32(1);
+	if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1)
+		MLX5_SET(qpc, qpc, deth_sqpn, 1);
 
 	mlx5_cur = to_mlx5_state(cur_state);
 	mlx5_new = to_mlx5_state(new_state);
@@ -3645,11 +3971,11 @@
 	}
 
 	op = optab[mlx5_cur][mlx5_new];
-	optpar = ib_mask_to_mlx5_opt(attr_mask);
+	optpar |= ib_mask_to_mlx5_opt(attr_mask);
 	optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
 
 	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET ||
-	    qp->flags & MLX5_IB_QP_UNDERLAY) {
+	    qp->flags & IB_QP_CREATE_SOURCE_QPN) {
 		struct mlx5_modify_raw_qp_param raw_qp_param = {};
 
 		raw_qp_param.operation = op;
@@ -3691,8 +4017,15 @@
 
 		err = modify_raw_packet_qp(dev, qp, &raw_qp_param, tx_affinity);
 	} else {
-		err = mlx5_core_qp_modify(dev->mdev, op, optpar, context,
-					  &base->mqp);
+		if (udata) {
+			/* For the kernel flows, the resp will stay zero */
+			resp->ece_options =
+				MLX5_CAP_GEN(dev->mdev, ece_support) ?
+					ucmd->ece_options : 0;
+			resp->response_length = sizeof(*resp);
+		}
+		err = mlx5_core_qp_modify(dev, op, optpar, qpc, &base->mqp,
+					  &resp->ece_options);
 	}
 
 	if (err)
@@ -3739,7 +4072,7 @@
 	}
 
 out:
-	kfree(context);
+	kfree(qpc);
 	return err;
 }
 
@@ -3797,14 +4130,15 @@
  * Other transitions and attributes are illegal
  */
 static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
-			      int attr_mask, struct ib_udata *udata)
+			      int attr_mask, struct mlx5_ib_modify_qp *ucmd,
+			      struct ib_udata *udata)
 {
 	struct mlx5_ib_qp *qp = to_mqp(ibqp);
 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
 	enum ib_qp_state cur_state, new_state;
-	int err = 0;
 	int required = IB_QP_STATE;
 	void *dctc;
+	int err;
 
 	if (!(attr_mask & IB_QP_STATE))
 		return -EINVAL;
@@ -3813,6 +4147,15 @@
 	new_state = attr->qp_state;
 
 	dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry);
+	if (MLX5_CAP_GEN(dev->mdev, ece_support) && ucmd->ece_options)
+		/*
+		 * DCT doesn't initialize QP till modify command is executed,
+		 * so we need to overwrite previously set ECE field if user
+		 * provided any value except zero, which means not set/not
+		 * valid.
+		 */
+		MLX5_SET(dctc, dctc, ece, ucmd->ece_options);
+
 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
 		u16 set_id;
 
@@ -3841,20 +4184,28 @@
 			MLX5_SET(dctc, dctc, rae, 1);
 		}
 		MLX5_SET(dctc, dctc, pkey_index, attr->pkey_index);
-		MLX5_SET(dctc, dctc, port, attr->port_num);
+		if (mlx5_lag_is_active(dev->mdev))
+			MLX5_SET(dctc, dctc, port,
+				 get_tx_affinity_rr(dev, udata));
+		else
+			MLX5_SET(dctc, dctc, port, attr->port_num);
 
 		set_id = mlx5_ib_get_counters_id(dev, attr->port_num - 1);
 		MLX5_SET(dctc, dctc, counter_set_id, set_id);
-
 	} else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
 		struct mlx5_ib_modify_qp_resp resp = {};
-		u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0};
-		u32 min_resp_len = offsetof(typeof(resp), dctn) +
-				   sizeof(resp.dctn);
+		u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {};
+		u32 min_resp_len = offsetofend(typeof(resp), dctn);
 
 		if (udata->outlen < min_resp_len)
 			return -EINVAL;
-		resp.response_length = min_resp_len;
+		/*
+		 * If we don't have enough space for the ECE options,
+		 * simply indicate it with resp.response_length.
+		 */
+		resp.response_length = (udata->outlen < sizeof(resp)) ?
+					       min_resp_len :
+					       sizeof(resp);
 
 		required |= IB_QP_MIN_RNR_TIMER | IB_QP_AV | IB_QP_PATH_MTU;
 		if (!is_valid_mask(attr_mask, required, 0))
@@ -3865,48 +4216,68 @@
 		MLX5_SET(dctc, dctc, mtu, attr->path_mtu);
 		MLX5_SET(dctc, dctc, my_addr_index, attr->ah_attr.grh.sgid_index);
 		MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit);
+		if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE)
+			MLX5_SET(dctc, dctc, eth_prio, attr->ah_attr.sl & 0x7);
 
-		err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in,
+		err = mlx5_core_create_dct(dev, &qp->dct.mdct, qp->dct.in,
 					   MLX5_ST_SZ_BYTES(create_dct_in), out,
 					   sizeof(out));
 		if (err)
 			return err;
 		resp.dctn = qp->dct.mdct.mqp.qpn;
+		if (MLX5_CAP_GEN(dev->mdev, ece_support))
+			resp.ece_options = MLX5_GET(create_dct_out, out, ece);
 		err = ib_copy_to_udata(udata, &resp, resp.response_length);
 		if (err) {
-			mlx5_core_destroy_dct(dev->mdev, &qp->dct.mdct);
+			mlx5_core_destroy_dct(dev, &qp->dct.mdct);
 			return err;
 		}
 	} else {
 		mlx5_ib_warn(dev, "Modify DCT: Invalid transition from %d to %d\n", cur_state, new_state);
 		return -EINVAL;
 	}
-	if (err)
-		qp->state = IB_QPS_ERR;
-	else
-		qp->state = new_state;
-	return err;
+
+	qp->state = new_state;
+	return 0;
+}
+
+static bool mlx5_ib_modify_qp_allowed(struct mlx5_ib_dev *dev,
+				      struct mlx5_ib_qp *qp,
+				      enum ib_qp_type qp_type)
+{
+	if (dev->profile != &raw_eth_profile)
+		return true;
+
+	if (qp_type == IB_QPT_RAW_PACKET || qp_type == MLX5_IB_QPT_REG_UMR)
+		return true;
+
+	/* Internal QP used for wc testing, with NOPs in wq */
+	if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST)
+		return true;
+
+	return false;
 }
 
 int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		      int attr_mask, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx5_ib_modify_qp_resp resp = {};
 	struct mlx5_ib_qp *qp = to_mqp(ibqp);
 	struct mlx5_ib_modify_qp ucmd = {};
 	enum ib_qp_type qp_type;
 	enum ib_qp_state cur_state, new_state;
-	size_t required_cmd_sz;
 	int err = -EINVAL;
 	int port;
 
+	if (!mlx5_ib_modify_qp_allowed(dev, qp, ibqp->qp_type))
+		return -EOPNOTSUPP;
+
 	if (ibqp->rwq_ind_tbl)
 		return -ENOSYS;
 
 	if (udata && udata->inlen) {
-		required_cmd_sz = offsetof(typeof(ucmd), reserved) +
-			sizeof(ucmd.reserved);
-		if (udata->inlen < required_cmd_sz)
+		if (udata->inlen < offsetofend(typeof(ucmd), ece_options))
 			return -EINVAL;
 
 		if (udata->inlen > sizeof(ucmd) &&
@@ -3919,23 +4290,20 @@
 			return -EFAULT;
 
 		if (ucmd.comp_mask ||
-		    memchr_inv(&ucmd.reserved, 0, sizeof(ucmd.reserved)) ||
 		    memchr_inv(&ucmd.burst_info.reserved, 0,
 			       sizeof(ucmd.burst_info.reserved)))
 			return -EOPNOTSUPP;
+
 	}
 
 	if (unlikely(ibqp->qp_type == IB_QPT_GSI))
 		return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask);
 
-	if (ibqp->qp_type == IB_QPT_DRIVER)
-		qp_type = qp->qp_sub_type;
-	else
-		qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ?
-			IB_QPT_GSI : ibqp->qp_type;
+	qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? IB_QPT_GSI :
+								    qp->type;
 
 	if (qp_type == MLX5_IB_QPT_DCT)
-		return mlx5_ib_modify_dct(ibqp, attr, attr_mask, udata);
+		return mlx5_ib_modify_dct(ibqp, attr, attr_mask, &ucmd, udata);
 
 	mutex_lock(&qp->mutex);
 
@@ -3946,7 +4314,7 @@
 		port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
 	}
 
-	if (qp->flags & MLX5_IB_QP_UNDERLAY) {
+	if (qp->flags & IB_QP_CREATE_SOURCE_QPN) {
 		if (attr_mask & ~(IB_QP_STATE | IB_QP_CUR_STATE)) {
 			mlx5_ib_dbg(dev, "invalid attr_mask 0x%x when underlay QP is used\n",
 				    attr_mask);
@@ -4006,1440 +4374,19 @@
 	}
 
 	err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state,
-				  new_state, &ucmd, udata);
+				  new_state, &ucmd, &resp, udata);
+
+	/* resp.response_length is set in ECE supported flows only */
+	if (!err && resp.response_length &&
+	    udata->outlen >= resp.response_length)
+		/* Return -EFAULT to the user and expect him to destroy QP. */
+		err = ib_copy_to_udata(udata, &resp, resp.response_length);
 
 out:
 	mutex_unlock(&qp->mutex);
 	return err;
 }
 
-static void _handle_post_send_edge(struct mlx5_ib_wq *sq, void **seg,
-				   u32 wqe_sz, void **cur_edge)
-{
-	u32 idx;
-
-	idx = (sq->cur_post + (wqe_sz >> 2)) & (sq->wqe_cnt - 1);
-	*cur_edge = get_sq_edge(sq, idx);
-
-	*seg = mlx5_frag_buf_get_wqe(&sq->fbc, idx);
-}
-
-/* handle_post_send_edge - Check if we get to SQ edge. If yes, update to the
- * next nearby edge and get new address translation for current WQE position.
- * @sq - SQ buffer.
- * @seg: Current WQE position (16B aligned).
- * @wqe_sz: Total current WQE size [16B].
- * @cur_edge: Updated current edge.
- */
-static inline void handle_post_send_edge(struct mlx5_ib_wq *sq, void **seg,
-					 u32 wqe_sz, void **cur_edge)
-{
-	if (likely(*seg != *cur_edge))
-		return;
-
-	_handle_post_send_edge(sq, seg, wqe_sz, cur_edge);
-}
-
-/* memcpy_send_wqe - copy data from src to WQE and update the relevant WQ's
- * pointers. At the end @seg is aligned to 16B regardless the copied size.
- * @sq - SQ buffer.
- * @cur_edge: Updated current edge.
- * @seg: Current WQE position (16B aligned).
- * @wqe_sz: Total current WQE size [16B].
- * @src: Pointer to copy from.
- * @n: Number of bytes to copy.
- */
-static inline void memcpy_send_wqe(struct mlx5_ib_wq *sq, void **cur_edge,
-				   void **seg, u32 *wqe_sz, const void *src,
-				   size_t n)
-{
-	while (likely(n)) {
-		size_t leftlen = *cur_edge - *seg;
-		size_t copysz = min_t(size_t, leftlen, n);
-		size_t stride;
-
-		memcpy(*seg, src, copysz);
-
-		n -= copysz;
-		src += copysz;
-		stride = !n ? ALIGN(copysz, 16) : copysz;
-		*seg += stride;
-		*wqe_sz += stride >> 4;
-		handle_post_send_edge(sq, seg, *wqe_sz, cur_edge);
-	}
-}
-
-static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
-{
-	struct mlx5_ib_cq *cq;
-	unsigned cur;
-
-	cur = wq->head - wq->tail;
-	if (likely(cur + nreq < wq->max_post))
-		return 0;
-
-	cq = to_mcq(ib_cq);
-	spin_lock(&cq->lock);
-	cur = wq->head - wq->tail;
-	spin_unlock(&cq->lock);
-
-	return cur + nreq >= wq->max_post;
-}
-
-static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg,
-					  u64 remote_addr, u32 rkey)
-{
-	rseg->raddr    = cpu_to_be64(remote_addr);
-	rseg->rkey     = cpu_to_be32(rkey);
-	rseg->reserved = 0;
-}
-
-static void set_eth_seg(const struct ib_send_wr *wr, struct mlx5_ib_qp *qp,
-			void **seg, int *size, void **cur_edge)
-{
-	struct mlx5_wqe_eth_seg *eseg = *seg;
-
-	memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg));
-
-	if (wr->send_flags & IB_SEND_IP_CSUM)
-		eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM |
-				 MLX5_ETH_WQE_L4_CSUM;
-
-	if (wr->opcode == IB_WR_LSO) {
-		struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr);
-		size_t left, copysz;
-		void *pdata = ud_wr->header;
-		size_t stride;
-
-		left = ud_wr->hlen;
-		eseg->mss = cpu_to_be16(ud_wr->mss);
-		eseg->inline_hdr.sz = cpu_to_be16(left);
-
-		/* memcpy_send_wqe should get a 16B align address. Hence, we
-		 * first copy up to the current edge and then, if needed,
-		 * fall-through to memcpy_send_wqe.
-		 */
-		copysz = min_t(u64, *cur_edge - (void *)eseg->inline_hdr.start,
-			       left);
-		memcpy(eseg->inline_hdr.start, pdata, copysz);
-		stride = ALIGN(sizeof(struct mlx5_wqe_eth_seg) -
-			       sizeof(eseg->inline_hdr.start) + copysz, 16);
-		*size += stride / 16;
-		*seg += stride;
-
-		if (copysz < left) {
-			handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
-			left -= copysz;
-			pdata += copysz;
-			memcpy_send_wqe(&qp->sq, cur_edge, seg, size, pdata,
-					left);
-		}
-
-		return;
-	}
-
-	*seg += sizeof(struct mlx5_wqe_eth_seg);
-	*size += sizeof(struct mlx5_wqe_eth_seg) / 16;
-}
-
-static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg,
-			     const struct ib_send_wr *wr)
-{
-	memcpy(&dseg->av, &to_mah(ud_wr(wr)->ah)->av, sizeof(struct mlx5_av));
-	dseg->av.dqp_dct = cpu_to_be32(ud_wr(wr)->remote_qpn | MLX5_EXTENDED_UD_AV);
-	dseg->av.key.qkey.qkey = cpu_to_be32(ud_wr(wr)->remote_qkey);
-}
-
-static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg)
-{
-	dseg->byte_count = cpu_to_be32(sg->length);
-	dseg->lkey       = cpu_to_be32(sg->lkey);
-	dseg->addr       = cpu_to_be64(sg->addr);
-}
-
-static u64 get_xlt_octo(u64 bytes)
-{
-	return ALIGN(bytes, MLX5_IB_UMR_XLT_ALIGNMENT) /
-	       MLX5_IB_UMR_OCTOWORD;
-}
-
-static __be64 frwr_mkey_mask(bool atomic)
-{
-	u64 result;
-
-	result = MLX5_MKEY_MASK_LEN		|
-		MLX5_MKEY_MASK_PAGE_SIZE	|
-		MLX5_MKEY_MASK_START_ADDR	|
-		MLX5_MKEY_MASK_EN_RINVAL	|
-		MLX5_MKEY_MASK_KEY		|
-		MLX5_MKEY_MASK_LR		|
-		MLX5_MKEY_MASK_LW		|
-		MLX5_MKEY_MASK_RR		|
-		MLX5_MKEY_MASK_RW		|
-		MLX5_MKEY_MASK_SMALL_FENCE	|
-		MLX5_MKEY_MASK_FREE;
-
-	if (atomic)
-		result |= MLX5_MKEY_MASK_A;
-
-	return cpu_to_be64(result);
-}
-
-static __be64 sig_mkey_mask(void)
-{
-	u64 result;
-
-	result = MLX5_MKEY_MASK_LEN		|
-		MLX5_MKEY_MASK_PAGE_SIZE	|
-		MLX5_MKEY_MASK_START_ADDR	|
-		MLX5_MKEY_MASK_EN_SIGERR	|
-		MLX5_MKEY_MASK_EN_RINVAL	|
-		MLX5_MKEY_MASK_KEY		|
-		MLX5_MKEY_MASK_LR		|
-		MLX5_MKEY_MASK_LW		|
-		MLX5_MKEY_MASK_RR		|
-		MLX5_MKEY_MASK_RW		|
-		MLX5_MKEY_MASK_SMALL_FENCE	|
-		MLX5_MKEY_MASK_FREE		|
-		MLX5_MKEY_MASK_BSF_EN;
-
-	return cpu_to_be64(result);
-}
-
-static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr,
-			    struct mlx5_ib_mr *mr, u8 flags, bool atomic)
-{
-	int size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size;
-
-	memset(umr, 0, sizeof(*umr));
-
-	umr->flags = flags;
-	umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size));
-	umr->mkey_mask = frwr_mkey_mask(atomic);
-}
-
-static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr)
-{
-	memset(umr, 0, sizeof(*umr));
-	umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE);
-	umr->flags = MLX5_UMR_INLINE;
-}
-
-static __be64 get_umr_enable_mr_mask(void)
-{
-	u64 result;
-
-	result = MLX5_MKEY_MASK_KEY |
-		 MLX5_MKEY_MASK_FREE;
-
-	return cpu_to_be64(result);
-}
-
-static __be64 get_umr_disable_mr_mask(void)
-{
-	u64 result;
-
-	result = MLX5_MKEY_MASK_FREE;
-
-	return cpu_to_be64(result);
-}
-
-static __be64 get_umr_update_translation_mask(void)
-{
-	u64 result;
-
-	result = MLX5_MKEY_MASK_LEN |
-		 MLX5_MKEY_MASK_PAGE_SIZE |
-		 MLX5_MKEY_MASK_START_ADDR;
-
-	return cpu_to_be64(result);
-}
-
-static __be64 get_umr_update_access_mask(int atomic)
-{
-	u64 result;
-
-	result = MLX5_MKEY_MASK_LR |
-		 MLX5_MKEY_MASK_LW |
-		 MLX5_MKEY_MASK_RR |
-		 MLX5_MKEY_MASK_RW;
-
-	if (atomic)
-		result |= MLX5_MKEY_MASK_A;
-
-	return cpu_to_be64(result);
-}
-
-static __be64 get_umr_update_pd_mask(void)
-{
-	u64 result;
-
-	result = MLX5_MKEY_MASK_PD;
-
-	return cpu_to_be64(result);
-}
-
-static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask)
-{
-	if ((mask & MLX5_MKEY_MASK_PAGE_SIZE &&
-	     MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled)) ||
-	    (mask & MLX5_MKEY_MASK_A &&
-	     MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled)))
-		return -EPERM;
-	return 0;
-}
-
-static int set_reg_umr_segment(struct mlx5_ib_dev *dev,
-			       struct mlx5_wqe_umr_ctrl_seg *umr,
-			       const struct ib_send_wr *wr, int atomic)
-{
-	const struct mlx5_umr_wr *umrwr = umr_wr(wr);
-
-	memset(umr, 0, sizeof(*umr));
-
-	if (!umrwr->ignore_free_state) {
-		if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE)
-			 /* fail if free */
-			umr->flags = MLX5_UMR_CHECK_FREE;
-		else
-			/* fail if not free */
-			umr->flags = MLX5_UMR_CHECK_NOT_FREE;
-	}
-
-	umr->xlt_octowords = cpu_to_be16(get_xlt_octo(umrwr->xlt_size));
-	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_XLT) {
-		u64 offset = get_xlt_octo(umrwr->offset);
-
-		umr->xlt_offset = cpu_to_be16(offset & 0xffff);
-		umr->xlt_offset_47_16 = cpu_to_be32(offset >> 16);
-		umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
-	}
-	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION)
-		umr->mkey_mask |= get_umr_update_translation_mask();
-	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS) {
-		umr->mkey_mask |= get_umr_update_access_mask(atomic);
-		umr->mkey_mask |= get_umr_update_pd_mask();
-	}
-	if (wr->send_flags & MLX5_IB_SEND_UMR_ENABLE_MR)
-		umr->mkey_mask |= get_umr_enable_mr_mask();
-	if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR)
-		umr->mkey_mask |= get_umr_disable_mr_mask();
-
-	if (!wr->num_sge)
-		umr->flags |= MLX5_UMR_INLINE;
-
-	return umr_check_mkey_mask(dev, be64_to_cpu(umr->mkey_mask));
-}
-
-static u8 get_umr_flags(int acc)
-{
-	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC       : 0) |
-	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX5_PERM_REMOTE_WRITE : 0) |
-	       (acc & IB_ACCESS_REMOTE_READ   ? MLX5_PERM_REMOTE_READ  : 0) |
-	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX5_PERM_LOCAL_WRITE  : 0) |
-		MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN;
-}
-
-static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg,
-			     struct mlx5_ib_mr *mr,
-			     u32 key, int access)
-{
-	int ndescs = ALIGN(mr->ndescs + mr->meta_ndescs, 8) >> 1;
-
-	memset(seg, 0, sizeof(*seg));
-
-	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_MTT)
-		seg->log2_page_size = ilog2(mr->ibmr.page_size);
-	else if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
-		/* KLMs take twice the size of MTTs */
-		ndescs *= 2;
-
-	seg->flags = get_umr_flags(access) | mr->access_mode;
-	seg->qpn_mkey7_0 = cpu_to_be32((key & 0xff) | 0xffffff00);
-	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL);
-	seg->start_addr = cpu_to_be64(mr->ibmr.iova);
-	seg->len = cpu_to_be64(mr->ibmr.length);
-	seg->xlt_oct_size = cpu_to_be32(ndescs);
-}
-
-static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg)
-{
-	memset(seg, 0, sizeof(*seg));
-	seg->status = MLX5_MKEY_STATUS_FREE;
-}
-
-static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg,
-				 const struct ib_send_wr *wr)
-{
-	const struct mlx5_umr_wr *umrwr = umr_wr(wr);
-
-	memset(seg, 0, sizeof(*seg));
-	if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR)
-		seg->status = MLX5_MKEY_STATUS_FREE;
-
-	seg->flags = convert_access(umrwr->access_flags);
-	if (umrwr->pd)
-		seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn);
-	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION &&
-	    !umrwr->length)
-		seg->flags_pd |= cpu_to_be32(MLX5_MKEY_LEN64);
-
-	seg->start_addr = cpu_to_be64(umrwr->virt_addr);
-	seg->len = cpu_to_be64(umrwr->length);
-	seg->log2_page_size = umrwr->page_shift;
-	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 |
-				       mlx5_mkey_variant(umrwr->mkey));
-}
-
-static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg,
-			     struct mlx5_ib_mr *mr,
-			     struct mlx5_ib_pd *pd)
-{
-	int bcount = mr->desc_size * (mr->ndescs + mr->meta_ndescs);
-
-	dseg->addr = cpu_to_be64(mr->desc_map);
-	dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64));
-	dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey);
-}
-
-static __be32 send_ieth(const struct ib_send_wr *wr)
-{
-	switch (wr->opcode) {
-	case IB_WR_SEND_WITH_IMM:
-	case IB_WR_RDMA_WRITE_WITH_IMM:
-		return wr->ex.imm_data;
-
-	case IB_WR_SEND_WITH_INV:
-		return cpu_to_be32(wr->ex.invalidate_rkey);
-
-	default:
-		return 0;
-	}
-}
-
-static u8 calc_sig(void *wqe, int size)
-{
-	u8 *p = wqe;
-	u8 res = 0;
-	int i;
-
-	for (i = 0; i < size; i++)
-		res ^= p[i];
-
-	return ~res;
-}
-
-static u8 wq_sig(void *wqe)
-{
-	return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4);
-}
-
-static int set_data_inl_seg(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr,
-			    void **wqe, int *wqe_sz, void **cur_edge)
-{
-	struct mlx5_wqe_inline_seg *seg;
-	size_t offset;
-	int inl = 0;
-	int i;
-
-	seg = *wqe;
-	*wqe += sizeof(*seg);
-	offset = sizeof(*seg);
-
-	for (i = 0; i < wr->num_sge; i++) {
-		size_t len  = wr->sg_list[i].length;
-		void *addr = (void *)(unsigned long)(wr->sg_list[i].addr);
-
-		inl += len;
-
-		if (unlikely(inl > qp->max_inline_data))
-			return -ENOMEM;
-
-		while (likely(len)) {
-			size_t leftlen;
-			size_t copysz;
-
-			handle_post_send_edge(&qp->sq, wqe,
-					      *wqe_sz + (offset >> 4),
-					      cur_edge);
-
-			leftlen = *cur_edge - *wqe;
-			copysz = min_t(size_t, leftlen, len);
-
-			memcpy(*wqe, addr, copysz);
-			len -= copysz;
-			addr += copysz;
-			*wqe += copysz;
-			offset += copysz;
-		}
-	}
-
-	seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG);
-
-	*wqe_sz +=  ALIGN(inl + sizeof(seg->byte_count), 16) / 16;
-
-	return 0;
-}
-
-static u16 prot_field_size(enum ib_signature_type type)
-{
-	switch (type) {
-	case IB_SIG_TYPE_T10_DIF:
-		return MLX5_DIF_SIZE;
-	default:
-		return 0;
-	}
-}
-
-static u8 bs_selector(int block_size)
-{
-	switch (block_size) {
-	case 512:	    return 0x1;
-	case 520:	    return 0x2;
-	case 4096:	    return 0x3;
-	case 4160:	    return 0x4;
-	case 1073741824:    return 0x5;
-	default:	    return 0;
-	}
-}
-
-static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain,
-			      struct mlx5_bsf_inl *inl)
-{
-	/* Valid inline section and allow BSF refresh */
-	inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID |
-				       MLX5_BSF_REFRESH_DIF);
-	inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag);
-	inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag);
-	/* repeating block */
-	inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK;
-	inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ?
-			MLX5_DIF_CRC : MLX5_DIF_IPCS;
-
-	if (domain->sig.dif.ref_remap)
-		inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG;
-
-	if (domain->sig.dif.app_escape) {
-		if (domain->sig.dif.ref_escape)
-			inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE;
-		else
-			inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE;
-	}
-
-	inl->dif_app_bitmask_check =
-		cpu_to_be16(domain->sig.dif.apptag_check_mask);
-}
-
-static int mlx5_set_bsf(struct ib_mr *sig_mr,
-			struct ib_sig_attrs *sig_attrs,
-			struct mlx5_bsf *bsf, u32 data_size)
-{
-	struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig;
-	struct mlx5_bsf_basic *basic = &bsf->basic;
-	struct ib_sig_domain *mem = &sig_attrs->mem;
-	struct ib_sig_domain *wire = &sig_attrs->wire;
-
-	memset(bsf, 0, sizeof(*bsf));
-
-	/* Basic + Extended + Inline */
-	basic->bsf_size_sbs = 1 << 7;
-	/* Input domain check byte mask */
-	basic->check_byte_mask = sig_attrs->check_mask;
-	basic->raw_data_size = cpu_to_be32(data_size);
-
-	/* Memory domain */
-	switch (sig_attrs->mem.sig_type) {
-	case IB_SIG_TYPE_NONE:
-		break;
-	case IB_SIG_TYPE_T10_DIF:
-		basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval);
-		basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx);
-		mlx5_fill_inl_bsf(mem, &bsf->m_inl);
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	/* Wire domain */
-	switch (sig_attrs->wire.sig_type) {
-	case IB_SIG_TYPE_NONE:
-		break;
-	case IB_SIG_TYPE_T10_DIF:
-		if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval &&
-		    mem->sig_type == wire->sig_type) {
-			/* Same block structure */
-			basic->bsf_size_sbs |= 1 << 4;
-			if (mem->sig.dif.bg_type == wire->sig.dif.bg_type)
-				basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK;
-			if (mem->sig.dif.app_tag == wire->sig.dif.app_tag)
-				basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK;
-			if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag)
-				basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK;
-		} else
-			basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval);
-
-		basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx);
-		mlx5_fill_inl_bsf(wire, &bsf->w_inl);
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int set_sig_data_segment(const struct ib_send_wr *send_wr,
-				struct ib_mr *sig_mr,
-				struct ib_sig_attrs *sig_attrs,
-				struct mlx5_ib_qp *qp, void **seg, int *size,
-				void **cur_edge)
-{
-	struct mlx5_bsf *bsf;
-	u32 data_len;
-	u32 data_key;
-	u64 data_va;
-	u32 prot_len = 0;
-	u32 prot_key = 0;
-	u64 prot_va = 0;
-	bool prot = false;
-	int ret;
-	int wqe_size;
-	struct mlx5_ib_mr *mr = to_mmr(sig_mr);
-	struct mlx5_ib_mr *pi_mr = mr->pi_mr;
-
-	data_len = pi_mr->data_length;
-	data_key = pi_mr->ibmr.lkey;
-	data_va = pi_mr->data_iova;
-	if (pi_mr->meta_ndescs) {
-		prot_len = pi_mr->meta_length;
-		prot_key = pi_mr->ibmr.lkey;
-		prot_va = pi_mr->pi_iova;
-		prot = true;
-	}
-
-	if (!prot || (data_key == prot_key && data_va == prot_va &&
-		      data_len == prot_len)) {
-		/**
-		 * Source domain doesn't contain signature information
-		 * or data and protection are interleaved in memory.
-		 * So need construct:
-		 *                  ------------------
-		 *                 |     data_klm     |
-		 *                  ------------------
-		 *                 |       BSF        |
-		 *                  ------------------
-		 **/
-		struct mlx5_klm *data_klm = *seg;
-
-		data_klm->bcount = cpu_to_be32(data_len);
-		data_klm->key = cpu_to_be32(data_key);
-		data_klm->va = cpu_to_be64(data_va);
-		wqe_size = ALIGN(sizeof(*data_klm), 64);
-	} else {
-		/**
-		 * Source domain contains signature information
-		 * So need construct a strided block format:
-		 *               ---------------------------
-		 *              |     stride_block_ctrl     |
-		 *               ---------------------------
-		 *              |          data_klm         |
-		 *               ---------------------------
-		 *              |          prot_klm         |
-		 *               ---------------------------
-		 *              |             BSF           |
-		 *               ---------------------------
-		 **/
-		struct mlx5_stride_block_ctrl_seg *sblock_ctrl;
-		struct mlx5_stride_block_entry *data_sentry;
-		struct mlx5_stride_block_entry *prot_sentry;
-		u16 block_size = sig_attrs->mem.sig.dif.pi_interval;
-		int prot_size;
-
-		sblock_ctrl = *seg;
-		data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl);
-		prot_sentry = (void *)data_sentry + sizeof(*data_sentry);
-
-		prot_size = prot_field_size(sig_attrs->mem.sig_type);
-		if (!prot_size) {
-			pr_err("Bad block size given: %u\n", block_size);
-			return -EINVAL;
-		}
-		sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size +
-							    prot_size);
-		sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP);
-		sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size);
-		sblock_ctrl->num_entries = cpu_to_be16(2);
-
-		data_sentry->bcount = cpu_to_be16(block_size);
-		data_sentry->key = cpu_to_be32(data_key);
-		data_sentry->va = cpu_to_be64(data_va);
-		data_sentry->stride = cpu_to_be16(block_size);
-
-		prot_sentry->bcount = cpu_to_be16(prot_size);
-		prot_sentry->key = cpu_to_be32(prot_key);
-		prot_sentry->va = cpu_to_be64(prot_va);
-		prot_sentry->stride = cpu_to_be16(prot_size);
-
-		wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) +
-				 sizeof(*prot_sentry), 64);
-	}
-
-	*seg += wqe_size;
-	*size += wqe_size / 16;
-	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
-
-	bsf = *seg;
-	ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len);
-	if (ret)
-		return -EINVAL;
-
-	*seg += sizeof(*bsf);
-	*size += sizeof(*bsf) / 16;
-	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
-
-	return 0;
-}
-
-static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg,
-				 struct ib_mr *sig_mr, int access_flags,
-				 u32 size, u32 length, u32 pdn)
-{
-	u32 sig_key = sig_mr->rkey;
-	u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1;
-
-	memset(seg, 0, sizeof(*seg));
-
-	seg->flags = get_umr_flags(access_flags) | MLX5_MKC_ACCESS_MODE_KLMS;
-	seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00);
-	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 |
-				    MLX5_MKEY_BSF_EN | pdn);
-	seg->len = cpu_to_be64(length);
-	seg->xlt_oct_size = cpu_to_be32(get_xlt_octo(size));
-	seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE);
-}
-
-static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
-				u32 size)
-{
-	memset(umr, 0, sizeof(*umr));
-
-	umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE;
-	umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size));
-	umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE);
-	umr->mkey_mask = sig_mkey_mask();
-}
-
-static int set_pi_umr_wr(const struct ib_send_wr *send_wr,
-			 struct mlx5_ib_qp *qp, void **seg, int *size,
-			 void **cur_edge)
-{
-	const struct ib_reg_wr *wr = reg_wr(send_wr);
-	struct mlx5_ib_mr *sig_mr = to_mmr(wr->mr);
-	struct mlx5_ib_mr *pi_mr = sig_mr->pi_mr;
-	struct ib_sig_attrs *sig_attrs = sig_mr->ibmr.sig_attrs;
-	u32 pdn = get_pd(qp)->pdn;
-	u32 xlt_size;
-	int region_len, ret;
-
-	if (unlikely(send_wr->num_sge != 0) ||
-	    unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) ||
-	    unlikely(!sig_mr->sig) || unlikely(!qp->ibqp.integrity_en) ||
-	    unlikely(!sig_mr->sig->sig_status_checked))
-		return -EINVAL;
-
-	/* length of the protected region, data + protection */
-	region_len = pi_mr->ibmr.length;
-
-	/**
-	 * KLM octoword size - if protection was provided
-	 * then we use strided block format (3 octowords),
-	 * else we use single KLM (1 octoword)
-	 **/
-	if (sig_attrs->mem.sig_type != IB_SIG_TYPE_NONE)
-		xlt_size = 0x30;
-	else
-		xlt_size = sizeof(struct mlx5_klm);
-
-	set_sig_umr_segment(*seg, xlt_size);
-	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
-	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
-	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
-
-	set_sig_mkey_segment(*seg, wr->mr, wr->access, xlt_size, region_len,
-			     pdn);
-	*seg += sizeof(struct mlx5_mkey_seg);
-	*size += sizeof(struct mlx5_mkey_seg) / 16;
-	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
-
-	ret = set_sig_data_segment(send_wr, wr->mr, sig_attrs, qp, seg, size,
-				   cur_edge);
-	if (ret)
-		return ret;
-
-	sig_mr->sig->sig_status_checked = false;
-	return 0;
-}
-
-static int set_psv_wr(struct ib_sig_domain *domain,
-		      u32 psv_idx, void **seg, int *size)
-{
-	struct mlx5_seg_set_psv *psv_seg = *seg;
-
-	memset(psv_seg, 0, sizeof(*psv_seg));
-	psv_seg->psv_num = cpu_to_be32(psv_idx);
-	switch (domain->sig_type) {
-	case IB_SIG_TYPE_NONE:
-		break;
-	case IB_SIG_TYPE_T10_DIF:
-		psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 |
-						     domain->sig.dif.app_tag);
-		psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag);
-		break;
-	default:
-		pr_err("Bad signature type (%d) is given.\n",
-		       domain->sig_type);
-		return -EINVAL;
-	}
-
-	*seg += sizeof(*psv_seg);
-	*size += sizeof(*psv_seg) / 16;
-
-	return 0;
-}
-
-static int set_reg_wr(struct mlx5_ib_qp *qp,
-		      const struct ib_reg_wr *wr,
-		      void **seg, int *size, void **cur_edge,
-		      bool check_not_free)
-{
-	struct mlx5_ib_mr *mr = to_mmr(wr->mr);
-	struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd);
-	struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
-	int mr_list_size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size;
-	bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD;
-	bool atomic = wr->access & IB_ACCESS_REMOTE_ATOMIC;
-	u8 flags = 0;
-
-	if (!mlx5_ib_can_use_umr(dev, atomic)) {
-		mlx5_ib_warn(to_mdev(qp->ibqp.device),
-			     "Fast update of %s for MR is disabled\n",
-			     (MLX5_CAP_GEN(dev->mdev,
-					   umr_modify_entity_size_disabled)) ?
-				     "entity size" :
-				     "atomic access");
-		return -EINVAL;
-	}
-
-	if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) {
-		mlx5_ib_warn(to_mdev(qp->ibqp.device),
-			     "Invalid IB_SEND_INLINE send flag\n");
-		return -EINVAL;
-	}
-
-	if (check_not_free)
-		flags |= MLX5_UMR_CHECK_NOT_FREE;
-	if (umr_inline)
-		flags |= MLX5_UMR_INLINE;
-
-	set_reg_umr_seg(*seg, mr, flags, atomic);
-	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
-	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
-	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
-
-	set_reg_mkey_seg(*seg, mr, wr->key, wr->access);
-	*seg += sizeof(struct mlx5_mkey_seg);
-	*size += sizeof(struct mlx5_mkey_seg) / 16;
-	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
-
-	if (umr_inline) {
-		memcpy_send_wqe(&qp->sq, cur_edge, seg, size, mr->descs,
-				mr_list_size);
-		*size = ALIGN(*size, MLX5_SEND_WQE_BB >> 4);
-	} else {
-		set_reg_data_seg(*seg, mr, pd);
-		*seg += sizeof(struct mlx5_wqe_data_seg);
-		*size += (sizeof(struct mlx5_wqe_data_seg) / 16);
-	}
-	return 0;
-}
-
-static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size,
-			void **cur_edge)
-{
-	set_linv_umr_seg(*seg);
-	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
-	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
-	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
-	set_linv_mkey_seg(*seg);
-	*seg += sizeof(struct mlx5_mkey_seg);
-	*size += sizeof(struct mlx5_mkey_seg) / 16;
-	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
-}
-
-static void dump_wqe(struct mlx5_ib_qp *qp, u32 idx, int size_16)
-{
-	__be32 *p = NULL;
-	int i, j;
-
-	pr_debug("dump WQE index %u:\n", idx);
-	for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) {
-		if ((i & 0xf) == 0) {
-			p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx);
-			pr_debug("WQBB at %p:\n", (void *)p);
-			j = 0;
-			idx = (idx + 1) & (qp->sq.wqe_cnt - 1);
-		}
-		pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]),
-			 be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]),
-			 be32_to_cpu(p[j + 3]));
-	}
-}
-
-static int __begin_wqe(struct mlx5_ib_qp *qp, void **seg,
-		       struct mlx5_wqe_ctrl_seg **ctrl,
-		       const struct ib_send_wr *wr, unsigned int *idx,
-		       int *size, void **cur_edge, int nreq,
-		       bool send_signaled, bool solicited)
-{
-	if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)))
-		return -ENOMEM;
-
-	*idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
-	*seg = mlx5_frag_buf_get_wqe(&qp->sq.fbc, *idx);
-	*ctrl = *seg;
-	*(uint32_t *)(*seg + 8) = 0;
-	(*ctrl)->imm = send_ieth(wr);
-	(*ctrl)->fm_ce_se = qp->sq_signal_bits |
-		(send_signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0) |
-		(solicited ? MLX5_WQE_CTRL_SOLICITED : 0);
-
-	*seg += sizeof(**ctrl);
-	*size = sizeof(**ctrl) / 16;
-	*cur_edge = qp->sq.cur_edge;
-
-	return 0;
-}
-
-static int begin_wqe(struct mlx5_ib_qp *qp, void **seg,
-		     struct mlx5_wqe_ctrl_seg **ctrl,
-		     const struct ib_send_wr *wr, unsigned *idx,
-		     int *size, void **cur_edge, int nreq)
-{
-	return __begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq,
-			   wr->send_flags & IB_SEND_SIGNALED,
-			   wr->send_flags & IB_SEND_SOLICITED);
-}
-
-static void finish_wqe(struct mlx5_ib_qp *qp,
-		       struct mlx5_wqe_ctrl_seg *ctrl,
-		       void *seg, u8 size, void *cur_edge,
-		       unsigned int idx, u64 wr_id, int nreq, u8 fence,
-		       u32 mlx5_opcode)
-{
-	u8 opmod = 0;
-
-	ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) |
-					     mlx5_opcode | ((u32)opmod << 24));
-	ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8));
-	ctrl->fm_ce_se |= fence;
-	if (unlikely(qp->wq_sig))
-		ctrl->signature = wq_sig(ctrl);
-
-	qp->sq.wrid[idx] = wr_id;
-	qp->sq.w_list[idx].opcode = mlx5_opcode;
-	qp->sq.wqe_head[idx] = qp->sq.head + nreq;
-	qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
-	qp->sq.w_list[idx].next = qp->sq.cur_post;
-
-	/* We save the edge which was possibly updated during the WQE
-	 * construction, into SQ's cache.
-	 */
-	seg = PTR_ALIGN(seg, MLX5_SEND_WQE_BB);
-	qp->sq.cur_edge = (unlikely(seg == cur_edge)) ?
-			  get_sq_edge(&qp->sq, qp->sq.cur_post &
-				      (qp->sq.wqe_cnt - 1)) :
-			  cur_edge;
-}
-
-static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
-			      const struct ib_send_wr **bad_wr, bool drain)
-{
-	struct mlx5_wqe_ctrl_seg *ctrl = NULL;  /* compiler warning */
-	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
-	struct mlx5_core_dev *mdev = dev->mdev;
-	struct ib_reg_wr reg_pi_wr;
-	struct mlx5_ib_qp *qp;
-	struct mlx5_ib_mr *mr;
-	struct mlx5_ib_mr *pi_mr;
-	struct mlx5_ib_mr pa_pi_mr;
-	struct ib_sig_attrs *sig_attrs;
-	struct mlx5_wqe_xrc_seg *xrc;
-	struct mlx5_bf *bf;
-	void *cur_edge;
-	int uninitialized_var(size);
-	unsigned long flags;
-	unsigned idx;
-	int err = 0;
-	int num_sge;
-	void *seg;
-	int nreq;
-	int i;
-	u8 next_fence = 0;
-	u8 fence;
-
-	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR &&
-		     !drain)) {
-		*bad_wr = wr;
-		return -EIO;
-	}
-
-	if (unlikely(ibqp->qp_type == IB_QPT_GSI))
-		return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr);
-
-	qp = to_mqp(ibqp);
-	bf = &qp->bf;
-
-	spin_lock_irqsave(&qp->sq.lock, flags);
-
-	for (nreq = 0; wr; nreq++, wr = wr->next) {
-		if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) {
-			mlx5_ib_warn(dev, "\n");
-			err = -EINVAL;
-			*bad_wr = wr;
-			goto out;
-		}
-
-		num_sge = wr->num_sge;
-		if (unlikely(num_sge > qp->sq.max_gs)) {
-			mlx5_ib_warn(dev, "\n");
-			err = -EINVAL;
-			*bad_wr = wr;
-			goto out;
-		}
-
-		err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, &cur_edge,
-				nreq);
-		if (err) {
-			mlx5_ib_warn(dev, "\n");
-			err = -ENOMEM;
-			*bad_wr = wr;
-			goto out;
-		}
-
-		if (wr->opcode == IB_WR_REG_MR ||
-		    wr->opcode == IB_WR_REG_MR_INTEGRITY) {
-			fence = dev->umr_fence;
-			next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
-		} else  {
-			if (wr->send_flags & IB_SEND_FENCE) {
-				if (qp->next_fence)
-					fence = MLX5_FENCE_MODE_SMALL_AND_FENCE;
-				else
-					fence = MLX5_FENCE_MODE_FENCE;
-			} else {
-				fence = qp->next_fence;
-			}
-		}
-
-		switch (ibqp->qp_type) {
-		case IB_QPT_XRC_INI:
-			xrc = seg;
-			seg += sizeof(*xrc);
-			size += sizeof(*xrc) / 16;
-			/* fall through */
-		case IB_QPT_RC:
-			switch (wr->opcode) {
-			case IB_WR_RDMA_READ:
-			case IB_WR_RDMA_WRITE:
-			case IB_WR_RDMA_WRITE_WITH_IMM:
-				set_raddr_seg(seg, rdma_wr(wr)->remote_addr,
-					      rdma_wr(wr)->rkey);
-				seg += sizeof(struct mlx5_wqe_raddr_seg);
-				size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
-				break;
-
-			case IB_WR_ATOMIC_CMP_AND_SWP:
-			case IB_WR_ATOMIC_FETCH_AND_ADD:
-			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
-				mlx5_ib_warn(dev, "Atomic operations are not supported yet\n");
-				err = -ENOSYS;
-				*bad_wr = wr;
-				goto out;
-
-			case IB_WR_LOCAL_INV:
-				qp->sq.wr_data[idx] = IB_WR_LOCAL_INV;
-				ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey);
-				set_linv_wr(qp, &seg, &size, &cur_edge);
-				num_sge = 0;
-				break;
-
-			case IB_WR_REG_MR:
-				qp->sq.wr_data[idx] = IB_WR_REG_MR;
-				ctrl->imm = cpu_to_be32(reg_wr(wr)->key);
-				err = set_reg_wr(qp, reg_wr(wr), &seg, &size,
-						 &cur_edge, true);
-				if (err) {
-					*bad_wr = wr;
-					goto out;
-				}
-				num_sge = 0;
-				break;
-
-			case IB_WR_REG_MR_INTEGRITY:
-				qp->sq.wr_data[idx] = IB_WR_REG_MR_INTEGRITY;
-
-				mr = to_mmr(reg_wr(wr)->mr);
-				pi_mr = mr->pi_mr;
-
-				if (pi_mr) {
-					memset(&reg_pi_wr, 0,
-					       sizeof(struct ib_reg_wr));
-
-					reg_pi_wr.mr = &pi_mr->ibmr;
-					reg_pi_wr.access = reg_wr(wr)->access;
-					reg_pi_wr.key = pi_mr->ibmr.rkey;
-
-					ctrl->imm = cpu_to_be32(reg_pi_wr.key);
-					/* UMR for data + prot registration */
-					err = set_reg_wr(qp, &reg_pi_wr, &seg,
-							 &size, &cur_edge,
-							 false);
-					if (err) {
-						*bad_wr = wr;
-						goto out;
-					}
-					finish_wqe(qp, ctrl, seg, size,
-						   cur_edge, idx, wr->wr_id,
-						   nreq, fence,
-						   MLX5_OPCODE_UMR);
-
-					err = begin_wqe(qp, &seg, &ctrl, wr,
-							&idx, &size, &cur_edge,
-							nreq);
-					if (err) {
-						mlx5_ib_warn(dev, "\n");
-						err = -ENOMEM;
-						*bad_wr = wr;
-						goto out;
-					}
-				} else {
-					memset(&pa_pi_mr, 0,
-					       sizeof(struct mlx5_ib_mr));
-					/* No UMR, use local_dma_lkey */
-					pa_pi_mr.ibmr.lkey =
-						mr->ibmr.pd->local_dma_lkey;
-
-					pa_pi_mr.ndescs = mr->ndescs;
-					pa_pi_mr.data_length = mr->data_length;
-					pa_pi_mr.data_iova = mr->data_iova;
-					if (mr->meta_ndescs) {
-						pa_pi_mr.meta_ndescs =
-							mr->meta_ndescs;
-						pa_pi_mr.meta_length =
-							mr->meta_length;
-						pa_pi_mr.pi_iova = mr->pi_iova;
-					}
-
-					pa_pi_mr.ibmr.length = mr->ibmr.length;
-					mr->pi_mr = &pa_pi_mr;
-				}
-				ctrl->imm = cpu_to_be32(mr->ibmr.rkey);
-				/* UMR for sig MR */
-				err = set_pi_umr_wr(wr, qp, &seg, &size,
-						    &cur_edge);
-				if (err) {
-					mlx5_ib_warn(dev, "\n");
-					*bad_wr = wr;
-					goto out;
-				}
-				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
-					   wr->wr_id, nreq, fence,
-					   MLX5_OPCODE_UMR);
-
-				/*
-				 * SET_PSV WQEs are not signaled and solicited
-				 * on error
-				 */
-				sig_attrs = mr->ibmr.sig_attrs;
-				err = __begin_wqe(qp, &seg, &ctrl, wr, &idx,
-						  &size, &cur_edge, nreq, false,
-						  true);
-				if (err) {
-					mlx5_ib_warn(dev, "\n");
-					err = -ENOMEM;
-					*bad_wr = wr;
-					goto out;
-				}
-				err = set_psv_wr(&sig_attrs->mem,
-						 mr->sig->psv_memory.psv_idx,
-						 &seg, &size);
-				if (err) {
-					mlx5_ib_warn(dev, "\n");
-					*bad_wr = wr;
-					goto out;
-				}
-				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
-					   wr->wr_id, nreq, next_fence,
-					   MLX5_OPCODE_SET_PSV);
-
-				err = __begin_wqe(qp, &seg, &ctrl, wr, &idx,
-						  &size, &cur_edge, nreq, false,
-						  true);
-				if (err) {
-					mlx5_ib_warn(dev, "\n");
-					err = -ENOMEM;
-					*bad_wr = wr;
-					goto out;
-				}
-				err = set_psv_wr(&sig_attrs->wire,
-						 mr->sig->psv_wire.psv_idx,
-						 &seg, &size);
-				if (err) {
-					mlx5_ib_warn(dev, "\n");
-					*bad_wr = wr;
-					goto out;
-				}
-				finish_wqe(qp, ctrl, seg, size, cur_edge, idx,
-					   wr->wr_id, nreq, next_fence,
-					   MLX5_OPCODE_SET_PSV);
-
-				qp->next_fence =
-					MLX5_FENCE_MODE_INITIATOR_SMALL;
-				num_sge = 0;
-				goto skip_psv;
-
-			default:
-				break;
-			}
-			break;
-
-		case IB_QPT_UC:
-			switch (wr->opcode) {
-			case IB_WR_RDMA_WRITE:
-			case IB_WR_RDMA_WRITE_WITH_IMM:
-				set_raddr_seg(seg, rdma_wr(wr)->remote_addr,
-					      rdma_wr(wr)->rkey);
-				seg  += sizeof(struct mlx5_wqe_raddr_seg);
-				size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
-				break;
-
-			default:
-				break;
-			}
-			break;
-
-		case IB_QPT_SMI:
-			if (unlikely(!mdev->port_caps[qp->port - 1].has_smi)) {
-				mlx5_ib_warn(dev, "Send SMP MADs is not allowed\n");
-				err = -EPERM;
-				*bad_wr = wr;
-				goto out;
-			}
-			/* fall through */
-		case MLX5_IB_QPT_HW_GSI:
-			set_datagram_seg(seg, wr);
-			seg += sizeof(struct mlx5_wqe_datagram_seg);
-			size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
-			handle_post_send_edge(&qp->sq, &seg, size, &cur_edge);
-
-			break;
-		case IB_QPT_UD:
-			set_datagram_seg(seg, wr);
-			seg += sizeof(struct mlx5_wqe_datagram_seg);
-			size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
-			handle_post_send_edge(&qp->sq, &seg, size, &cur_edge);
-
-			/* handle qp that supports ud offload */
-			if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) {
-				struct mlx5_wqe_eth_pad *pad;
-
-				pad = seg;
-				memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad));
-				seg += sizeof(struct mlx5_wqe_eth_pad);
-				size += sizeof(struct mlx5_wqe_eth_pad) / 16;
-				set_eth_seg(wr, qp, &seg, &size, &cur_edge);
-				handle_post_send_edge(&qp->sq, &seg, size,
-						      &cur_edge);
-			}
-			break;
-		case MLX5_IB_QPT_REG_UMR:
-			if (wr->opcode != MLX5_IB_WR_UMR) {
-				err = -EINVAL;
-				mlx5_ib_warn(dev, "bad opcode\n");
-				goto out;
-			}
-			qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;
-			ctrl->imm = cpu_to_be32(umr_wr(wr)->mkey);
-			err = set_reg_umr_segment(dev, seg, wr, !!(MLX5_CAP_GEN(mdev, atomic)));
-			if (unlikely(err))
-				goto out;
-			seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
-			size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
-			handle_post_send_edge(&qp->sq, &seg, size, &cur_edge);
-			set_reg_mkey_segment(seg, wr);
-			seg += sizeof(struct mlx5_mkey_seg);
-			size += sizeof(struct mlx5_mkey_seg) / 16;
-			handle_post_send_edge(&qp->sq, &seg, size, &cur_edge);
-			break;
-
-		default:
-			break;
-		}
-
-		if (wr->send_flags & IB_SEND_INLINE && num_sge) {
-			err = set_data_inl_seg(qp, wr, &seg, &size, &cur_edge);
-			if (unlikely(err)) {
-				mlx5_ib_warn(dev, "\n");
-				*bad_wr = wr;
-				goto out;
-			}
-		} else {
-			for (i = 0; i < num_sge; i++) {
-				handle_post_send_edge(&qp->sq, &seg, size,
-						      &cur_edge);
-				if (likely(wr->sg_list[i].length)) {
-					set_data_ptr_seg
-					((struct mlx5_wqe_data_seg *)seg,
-					 wr->sg_list + i);
-					size += sizeof(struct mlx5_wqe_data_seg) / 16;
-					seg += sizeof(struct mlx5_wqe_data_seg);
-				}
-			}
-		}
-
-		qp->next_fence = next_fence;
-		finish_wqe(qp, ctrl, seg, size, cur_edge, idx, wr->wr_id, nreq,
-			   fence, mlx5_ib_opcode[wr->opcode]);
-skip_psv:
-		if (0)
-			dump_wqe(qp, idx, size);
-	}
-
-out:
-	if (likely(nreq)) {
-		qp->sq.head += nreq;
-
-		/* Make sure that descriptors are written before
-		 * updating doorbell record and ringing the doorbell
-		 */
-		wmb();
-
-		qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post);
-
-		/* Make sure doorbell record is visible to the HCA before
-		 * we hit doorbell */
-		wmb();
-
-		/* currently we support only regular doorbells */
-		mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset);
-		/* Make sure doorbells don't leak out of SQ spinlock
-		 * and reach the HCA out of order.
-		 */
-		bf->offset ^= bf->buf_size;
-	}
-
-	spin_unlock_irqrestore(&qp->sq.lock, flags);
-
-	return err;
-}
-
-int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
-		      const struct ib_send_wr **bad_wr)
-{
-	return _mlx5_ib_post_send(ibqp, wr, bad_wr, false);
-}
-
-static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size)
-{
-	sig->signature = calc_sig(sig, size);
-}
-
-static int _mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
-		      const struct ib_recv_wr **bad_wr, bool drain)
-{
-	struct mlx5_ib_qp *qp = to_mqp(ibqp);
-	struct mlx5_wqe_data_seg *scat;
-	struct mlx5_rwqe_sig *sig;
-	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
-	struct mlx5_core_dev *mdev = dev->mdev;
-	unsigned long flags;
-	int err = 0;
-	int nreq;
-	int ind;
-	int i;
-
-	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR &&
-		     !drain)) {
-		*bad_wr = wr;
-		return -EIO;
-	}
-
-	if (unlikely(ibqp->qp_type == IB_QPT_GSI))
-		return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr);
-
-	spin_lock_irqsave(&qp->rq.lock, flags);
-
-	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
-
-	for (nreq = 0; wr; nreq++, wr = wr->next) {
-		if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
-			err = -ENOMEM;
-			*bad_wr = wr;
-			goto out;
-		}
-
-		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
-			err = -EINVAL;
-			*bad_wr = wr;
-			goto out;
-		}
-
-		scat = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ind);
-		if (qp->wq_sig)
-			scat++;
-
-		for (i = 0; i < wr->num_sge; i++)
-			set_data_ptr_seg(scat + i, wr->sg_list + i);
-
-		if (i < qp->rq.max_gs) {
-			scat[i].byte_count = 0;
-			scat[i].lkey       = cpu_to_be32(MLX5_INVALID_LKEY);
-			scat[i].addr       = 0;
-		}
-
-		if (qp->wq_sig) {
-			sig = (struct mlx5_rwqe_sig *)scat;
-			set_sig_seg(sig, (qp->rq.max_gs + 1) << 2);
-		}
-
-		qp->rq.wrid[ind] = wr->wr_id;
-
-		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
-	}
-
-out:
-	if (likely(nreq)) {
-		qp->rq.head += nreq;
-
-		/* Make sure that descriptors are written before
-		 * doorbell record.
-		 */
-		wmb();
-
-		*qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
-	}
-
-	spin_unlock_irqrestore(&qp->rq.lock, flags);
-
-	return err;
-}
-
-int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
-		      const struct ib_recv_wr **bad_wr)
-{
-	return _mlx5_ib_post_recv(ibqp, wr, bad_wr, false);
-}
-
 static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state)
 {
 	switch (mlx5_state) {
@@ -5465,50 +4412,34 @@
 	}
 }
 
-static int to_ib_qp_access_flags(int mlx5_flags)
-{
-	int ib_flags = 0;
-
-	if (mlx5_flags & MLX5_QP_BIT_RRE)
-		ib_flags |= IB_ACCESS_REMOTE_READ;
-	if (mlx5_flags & MLX5_QP_BIT_RWE)
-		ib_flags |= IB_ACCESS_REMOTE_WRITE;
-	if (mlx5_flags & MLX5_QP_BIT_RAE)
-		ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
-
-	return ib_flags;
-}
-
 static void to_rdma_ah_attr(struct mlx5_ib_dev *ibdev,
-			    struct rdma_ah_attr *ah_attr,
-			    struct mlx5_qp_path *path)
+			    struct rdma_ah_attr *ah_attr, void *path)
 {
+	int port = MLX5_GET(ads, path, vhca_port_num);
+	int static_rate;
 
 	memset(ah_attr, 0, sizeof(*ah_attr));
 
-	if (!path->port || path->port > ibdev->num_ports)
+	if (!port || port > ibdev->num_ports)
 		return;
 
-	ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, path->port);
+	ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, port);
 
-	rdma_ah_set_port_num(ah_attr, path->port);
-	rdma_ah_set_sl(ah_attr, path->dci_cfi_prio_sl & 0xf);
+	rdma_ah_set_port_num(ah_attr, port);
+	rdma_ah_set_sl(ah_attr, MLX5_GET(ads, path, sl));
 
-	rdma_ah_set_dlid(ah_attr, be16_to_cpu(path->rlid));
-	rdma_ah_set_path_bits(ah_attr, path->grh_mlid & 0x7f);
-	rdma_ah_set_static_rate(ah_attr,
-				path->static_rate ? path->static_rate - 5 : 0);
+	rdma_ah_set_dlid(ah_attr, MLX5_GET(ads, path, rlid));
+	rdma_ah_set_path_bits(ah_attr, MLX5_GET(ads, path, mlid));
 
-	if (path->grh_mlid & (1 << 7) ||
+	static_rate = MLX5_GET(ads, path, stat_rate);
+	rdma_ah_set_static_rate(ah_attr, mlx5_to_ib_rate_map(static_rate));
+	if (MLX5_GET(ads, path, grh) ||
 	    ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
-		u32 tc_fl = be32_to_cpu(path->tclass_flowlabel);
-
-		rdma_ah_set_grh(ah_attr, NULL,
-				tc_fl & 0xfffff,
-				path->mgid_index,
-				path->hop_limit,
-				(tc_fl >> 20) & 0xff);
-		rdma_ah_set_dgid_raw(ah_attr, path->rgid);
+		rdma_ah_set_grh(ah_attr, NULL, MLX5_GET(ads, path, flow_label),
+				MLX5_GET(ads, path, src_addr_index),
+				MLX5_GET(ads, path, hop_limit),
+				MLX5_GET(ads, path, tclass));
+		rdma_ah_set_dgid_raw(ah_attr, MLX5_ADDR_OF(ads, path, rgid_rip));
 	}
 }
 
@@ -5565,7 +4496,7 @@
 			[MLX5_SQ_STATE_NA]	= IB_QPS_RESET,
 		},
 		[MLX5_RQC_STATE_RDY] = {
-			[MLX5_SQC_STATE_RST]	= MLX5_QP_STATE_BAD,
+			[MLX5_SQC_STATE_RST]	= MLX5_QP_STATE,
 			[MLX5_SQC_STATE_RDY]	= MLX5_QP_STATE,
 			[MLX5_SQC_STATE_ERR]	= IB_QPS_SQE,
 			[MLX5_SQ_STATE_NA]	= MLX5_QP_STATE,
@@ -5577,7 +4508,7 @@
 			[MLX5_SQ_STATE_NA]	= IB_QPS_ERR,
 		},
 		[MLX5_RQ_STATE_NA] = {
-			[MLX5_SQC_STATE_RST]    = IB_QPS_RESET,
+			[MLX5_SQC_STATE_RST]    = MLX5_QP_STATE,
 			[MLX5_SQC_STATE_RDY]	= MLX5_QP_STATE,
 			[MLX5_SQC_STATE_ERR]	= MLX5_QP_STATE,
 			[MLX5_SQ_STATE_NA]	= MLX5_QP_STATE_BAD,
@@ -5630,61 +4561,58 @@
 			 struct ib_qp_attr *qp_attr)
 {
 	int outlen = MLX5_ST_SZ_BYTES(query_qp_out);
-	struct mlx5_qp_context *context;
-	int mlx5_state;
+	void *qpc, *pri_path, *alt_path;
 	u32 *outb;
-	int err = 0;
+	int err;
 
 	outb = kzalloc(outlen, GFP_KERNEL);
 	if (!outb)
 		return -ENOMEM;
 
-	err = mlx5_core_qp_query(dev->mdev, &qp->trans_qp.base.mqp, outb,
-				 outlen);
+	err = mlx5_core_qp_query(dev, &qp->trans_qp.base.mqp, outb, outlen);
 	if (err)
 		goto out;
 
-	/* FIXME: use MLX5_GET rather than mlx5_qp_context manual struct */
-	context = (struct mlx5_qp_context *)MLX5_ADDR_OF(query_qp_out, outb, qpc);
+	qpc = MLX5_ADDR_OF(query_qp_out, outb, qpc);
 
-	mlx5_state = be32_to_cpu(context->flags) >> 28;
+	qp->state = to_ib_qp_state(MLX5_GET(qpc, qpc, state));
+	if (MLX5_GET(qpc, qpc, state) == MLX5_QP_STATE_SQ_DRAINING)
+		qp_attr->sq_draining = 1;
 
-	qp->state		     = to_ib_qp_state(mlx5_state);
-	qp_attr->path_mtu	     = context->mtu_msgmax >> 5;
-	qp_attr->path_mig_state	     =
-		to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3);
-	qp_attr->qkey		     = be32_to_cpu(context->qkey);
-	qp_attr->rq_psn		     = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff;
-	qp_attr->sq_psn		     = be32_to_cpu(context->next_send_psn) & 0xffffff;
-	qp_attr->dest_qp_num	     = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff;
-	qp_attr->qp_access_flags     =
-		to_ib_qp_access_flags(be32_to_cpu(context->params2));
+	qp_attr->path_mtu = MLX5_GET(qpc, qpc, mtu);
+	qp_attr->path_mig_state = to_ib_mig_state(MLX5_GET(qpc, qpc, pm_state));
+	qp_attr->qkey = MLX5_GET(qpc, qpc, q_key);
+	qp_attr->rq_psn = MLX5_GET(qpc, qpc, next_rcv_psn);
+	qp_attr->sq_psn = MLX5_GET(qpc, qpc, next_send_psn);
+	qp_attr->dest_qp_num = MLX5_GET(qpc, qpc, remote_qpn);
+
+	if (MLX5_GET(qpc, qpc, rre))
+		qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ;
+	if (MLX5_GET(qpc, qpc, rwe))
+		qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_WRITE;
+	if (MLX5_GET(qpc, qpc, rae))
+		qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+	qp_attr->max_rd_atomic = 1 << MLX5_GET(qpc, qpc, log_sra_max);
+	qp_attr->max_dest_rd_atomic = 1 << MLX5_GET(qpc, qpc, log_rra_max);
+	qp_attr->min_rnr_timer = MLX5_GET(qpc, qpc, min_rnr_nak);
+	qp_attr->retry_cnt = MLX5_GET(qpc, qpc, retry_count);
+	qp_attr->rnr_retry = MLX5_GET(qpc, qpc, rnr_retry);
+
+	pri_path = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
+	alt_path = MLX5_ADDR_OF(qpc, qpc, secondary_address_path);
 
 	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
-		to_rdma_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path);
-		to_rdma_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path);
-		qp_attr->alt_pkey_index =
-			be16_to_cpu(context->alt_path.pkey_index);
-		qp_attr->alt_port_num	=
-			rdma_ah_get_port_num(&qp_attr->alt_ah_attr);
+		to_rdma_ah_attr(dev, &qp_attr->ah_attr, pri_path);
+		to_rdma_ah_attr(dev, &qp_attr->alt_ah_attr, alt_path);
+		qp_attr->alt_pkey_index = MLX5_GET(ads, alt_path, pkey_index);
+		qp_attr->alt_port_num = MLX5_GET(ads, alt_path, vhca_port_num);
 	}
 
-	qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index);
-	qp_attr->port_num = context->pri_path.port;
-
-	/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
-	qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING;
-
-	qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7);
-
-	qp_attr->max_dest_rd_atomic =
-		1 << ((be32_to_cpu(context->params2) >> 21) & 0x7);
-	qp_attr->min_rnr_timer	    =
-		(be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f;
-	qp_attr->timeout	    = context->pri_path.ackto_lt >> 3;
-	qp_attr->retry_cnt	    = (be32_to_cpu(context->params1) >> 16) & 0x7;
-	qp_attr->rnr_retry	    = (be32_to_cpu(context->params1) >> 13) & 0x7;
-	qp_attr->alt_timeout	    = context->alt_path.ackto_lt >> 3;
+	qp_attr->pkey_index = MLX5_GET(ads, pri_path, pkey_index);
+	qp_attr->port_num = MLX5_GET(ads, pri_path, vhca_port_num);
+	qp_attr->timeout = MLX5_GET(ads, pri_path, ack_timeout);
+	qp_attr->alt_timeout = MLX5_GET(ads, alt_path, ack_timeout);
 
 out:
 	kfree(outb);
@@ -5718,7 +4646,7 @@
 	if (!out)
 		return -ENOMEM;
 
-	err = mlx5_core_dct_query(dev->mdev, dct, out, outlen);
+	err = mlx5_core_dct_query(dev, dct, out, outlen);
 	if (err)
 		goto out;
 
@@ -5775,14 +4703,14 @@
 	memset(qp_init_attr, 0, sizeof(*qp_init_attr));
 	memset(qp_attr, 0, sizeof(*qp_attr));
 
-	if (unlikely(qp->qp_sub_type == MLX5_IB_QPT_DCT))
+	if (unlikely(qp->type == MLX5_IB_QPT_DCT))
 		return mlx5_ib_dct_query_qp(dev, qp, qp_attr,
 					    qp_attr_mask, qp_init_attr);
 
 	mutex_lock(&qp->mutex);
 
 	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET ||
-	    qp->flags & MLX5_IB_QP_UNDERLAY) {
+	    qp->flags & IB_QP_CREATE_SOURCE_QPN) {
 		err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state);
 		if (err)
 			goto out;
@@ -5816,18 +4744,7 @@
 
 	qp_init_attr->cap	     = qp_attr->cap;
 
-	qp_init_attr->create_flags = 0;
-	if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
-		qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
-
-	if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL)
-		qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL;
-	if (qp->flags & MLX5_IB_QP_MANAGED_SEND)
-		qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND;
-	if (qp->flags & MLX5_IB_QP_MANAGED_RECV)
-		qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV;
-	if (qp->flags & MLX5_IB_QP_SQPN_QP1)
-		qp_init_attr->create_flags |= mlx5_ib_create_qp_sqpn_qp1();
+	qp_init_attr->create_flags = qp->flags;
 
 	qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ?
 		IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
@@ -5837,41 +4754,23 @@
 	return err;
 }
 
-struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
-				   struct ib_udata *udata)
+int mlx5_ib_alloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata)
 {
-	struct mlx5_ib_dev *dev = to_mdev(ibdev);
-	struct mlx5_ib_xrcd *xrcd;
-	int err;
+	struct mlx5_ib_dev *dev = to_mdev(ibxrcd->device);
+	struct mlx5_ib_xrcd *xrcd = to_mxrcd(ibxrcd);
 
 	if (!MLX5_CAP_GEN(dev->mdev, xrc))
-		return ERR_PTR(-ENOSYS);
+		return -EOPNOTSUPP;
 
-	xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL);
-	if (!xrcd)
-		return ERR_PTR(-ENOMEM);
-
-	err = mlx5_cmd_xrcd_alloc(dev->mdev, &xrcd->xrcdn, 0);
-	if (err) {
-		kfree(xrcd);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	return &xrcd->ibxrcd;
+	return mlx5_cmd_xrcd_alloc(dev->mdev, &xrcd->xrcdn, 0);
 }
 
 int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(xrcd->device);
 	u32 xrcdn = to_mxrcd(xrcd)->xrcdn;
-	int err;
 
-	err = mlx5_cmd_xrcd_dealloc(dev->mdev, xrcdn, 0);
-	if (err)
-		mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn);
-
-	kfree(xrcd);
-	return 0;
+	return mlx5_cmd_xrcd_dealloc(dev->mdev, xrcdn, 0);
 }
 
 static void mlx5_ib_wq_event(struct mlx5_core_qp *core_qp, int type)
@@ -5904,7 +4803,7 @@
 	if (dev->delay_drop.activate)
 		goto out;
 
-	err = mlx5_core_set_delay_drop(dev->mdev, dev->delay_drop.timeout);
+	err = mlx5_core_set_delay_drop(dev, dev->delay_drop.timeout);
 	if (err)
 		goto out;
 
@@ -5959,12 +4858,21 @@
 	}
 	MLX5_SET(wq, wq, log_wq_stride, rwq->log_rq_stride);
 	if (rwq->create_flags & MLX5_IB_WQ_FLAGS_STRIDING_RQ) {
+		/*
+		 * In Firmware number of strides in each WQE is:
+		 *   "512 * 2^single_wqe_log_num_of_strides"
+		 * Values 3 to 8 are accepted as 10 to 15, 9 to 18 are
+		 * accepted as 0 to 9
+		 */
+		static const u8 fw_map[] = { 10, 11, 12, 13, 14, 15, 0, 1,
+					     2,  3,  4,  5,  6,  7,  8, 9 };
 		MLX5_SET(wq, wq, two_byte_shift_en, rwq->two_byte_shift_en);
 		MLX5_SET(wq, wq, log_wqe_stride_size,
 			 rwq->single_stride_log_num_of_bytes -
 			 MLX5_MIN_SINGLE_STRIDE_LOG_NUM_BYTES);
-		MLX5_SET(wq, wq, log_wqe_num_of_strides, rwq->log_num_strides -
-			 MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES);
+		MLX5_SET(wq, wq, log_wqe_num_of_strides,
+			 fw_map[rwq->log_num_strides -
+				MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES]);
 	}
 	MLX5_SET(wq, wq, log_wq_sz, rwq->log_rq_size);
 	MLX5_SET(wq, wq, pd, to_mpd(pd)->pdn);
@@ -6001,13 +4909,13 @@
 	}
 	rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas);
 	mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0);
-	err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp);
+	err = mlx5_core_create_rq_tracked(dev, in, inlen, &rwq->core_qp);
 	if (!err && init_attr->create_flags & IB_WQ_FLAGS_DELAY_DROP) {
 		err = set_delay_drop(dev);
 		if (err) {
 			mlx5_ib_warn(dev, "Failed to enable delay drop err=%d\n",
 				     err);
-			mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
+			mlx5_core_destroy_rq_tracked(dev, &rwq->core_qp);
 		} else {
 			rwq->create_flags |= MLX5_IB_WQ_FLAGS_DELAY_DROP;
 		}
@@ -6039,6 +4947,19 @@
 	return 0;
 }
 
+static bool log_of_strides_valid(struct mlx5_ib_dev *dev, u32 log_num_strides)
+{
+	if ((log_num_strides > MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES) ||
+	    (log_num_strides < MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES))
+		return false;
+
+	if (!MLX5_CAP_GEN(dev->mdev, ext_stride_num_range) &&
+	    (log_num_strides < MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES))
+		return false;
+
+	return true;
+}
+
 static int prepare_user_rq(struct ib_pd *pd,
 			   struct ib_wq_init_attr *init_attr,
 			   struct ib_udata *udata,
@@ -6049,8 +4970,8 @@
 	int err;
 	size_t required_cmd_sz;
 
-	required_cmd_sz = offsetof(typeof(ucmd), single_stride_log_num_of_bytes)
-		+ sizeof(ucmd.single_stride_log_num_of_bytes);
+	required_cmd_sz = offsetofend(struct mlx5_ib_create_wq,
+				      single_stride_log_num_of_bytes);
 	if (udata->inlen < required_cmd_sz) {
 		mlx5_ib_dbg(dev, "invalid inlen\n");
 		return -EINVAL;
@@ -6086,14 +5007,16 @@
 				    MLX5_MAX_SINGLE_STRIDE_LOG_NUM_BYTES);
 			return -EINVAL;
 		}
-		if ((ucmd.single_wqe_log_num_of_strides >
-		    MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES) ||
-		     (ucmd.single_wqe_log_num_of_strides <
-			MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES)) {
-			mlx5_ib_dbg(dev, "Invalid log num strides (%u. Range is %u - %u)\n",
-				    ucmd.single_wqe_log_num_of_strides,
-				    MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES,
-				    MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES);
+		if (!log_of_strides_valid(dev,
+					  ucmd.single_wqe_log_num_of_strides)) {
+			mlx5_ib_dbg(
+				dev,
+				"Invalid log num strides (%u. Range is %u - %u)\n",
+				ucmd.single_wqe_log_num_of_strides,
+				MLX5_CAP_GEN(dev->mdev, ext_stride_num_range) ?
+					MLX5_EXT_MIN_SINGLE_WQE_LOG_NUM_STRIDES :
+					MLX5_MIN_SINGLE_WQE_LOG_NUM_STRIDES,
+				MLX5_MAX_SINGLE_WQE_LOG_NUM_STRIDES);
 			return -EINVAL;
 		}
 		rwq->single_stride_log_num_of_bytes =
@@ -6132,7 +5055,7 @@
 	if (!udata)
 		return ERR_PTR(-ENOSYS);
 
-	min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
+	min_resp_len = offsetofend(struct mlx5_ib_create_wq_resp, reserved);
 	if (udata->outlen && udata->outlen < min_resp_len)
 		return ERR_PTR(-EINVAL);
 
@@ -6162,8 +5085,8 @@
 	rwq->ibwq.wq_num = rwq->core_qp.qpn;
 	rwq->ibwq.state = IB_WQS_RESET;
 	if (udata->outlen) {
-		resp.response_length = offsetof(typeof(resp), response_length) +
-				sizeof(resp.response_length);
+		resp.response_length = offsetofend(
+			struct mlx5_ib_create_wq_resp, response_length);
 		err = ib_copy_to_udata(udata, &resp, resp.response_length);
 		if (err)
 			goto err_copy;
@@ -6174,7 +5097,7 @@
 	return &rwq->ibwq;
 
 err_copy:
-	mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
+	mlx5_core_destroy_rq_tracked(dev, &rwq->core_qp);
 err_user_rq:
 	destroy_user_rq(dev, pd, rwq, udata);
 err:
@@ -6182,22 +5105,27 @@
 	return ERR_PTR(err);
 }
 
-void mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
+int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(wq->device);
 	struct mlx5_ib_rwq *rwq = to_mrwq(wq);
+	int ret;
 
-	mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
+	ret = mlx5_core_destroy_rq_tracked(dev, &rwq->core_qp);
+	if (ret)
+		return ret;
 	destroy_user_rq(dev, wq->pd, rwq, udata);
 	kfree(rwq);
+	return 0;
 }
 
-struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
-						      struct ib_rwq_ind_table_init_attr *init_attr,
-						      struct ib_udata *udata)
+int mlx5_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table,
+				 struct ib_rwq_ind_table_init_attr *init_attr,
+				 struct ib_udata *udata)
 {
-	struct mlx5_ib_dev *dev = to_mdev(device);
-	struct mlx5_ib_rwq_ind_table *rwq_ind_tbl;
+	struct mlx5_ib_rwq_ind_table *rwq_ind_tbl =
+		to_mrwq_ind_table(ib_rwq_ind_table);
+	struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_table->device);
 	int sz = 1 << init_attr->log_ind_tbl_size;
 	struct mlx5_ib_create_rwq_ind_tbl_resp resp = {};
 	size_t min_resp_len;
@@ -6210,30 +5138,25 @@
 	if (udata->inlen > 0 &&
 	    !ib_is_udata_cleared(udata, 0,
 				 udata->inlen))
-		return ERR_PTR(-EOPNOTSUPP);
+		return -EOPNOTSUPP;
 
 	if (init_attr->log_ind_tbl_size >
 	    MLX5_CAP_GEN(dev->mdev, log_max_rqt_size)) {
 		mlx5_ib_dbg(dev, "log_ind_tbl_size = %d is bigger than supported = %d\n",
 			    init_attr->log_ind_tbl_size,
 			    MLX5_CAP_GEN(dev->mdev, log_max_rqt_size));
-		return ERR_PTR(-EINVAL);
+		return -EINVAL;
 	}
 
-	min_resp_len = offsetof(typeof(resp), reserved) + sizeof(resp.reserved);
+	min_resp_len =
+		offsetofend(struct mlx5_ib_create_rwq_ind_tbl_resp, reserved);
 	if (udata->outlen && udata->outlen < min_resp_len)
-		return ERR_PTR(-EINVAL);
-
-	rwq_ind_tbl = kzalloc(sizeof(*rwq_ind_tbl), GFP_KERNEL);
-	if (!rwq_ind_tbl)
-		return ERR_PTR(-ENOMEM);
+		return -EINVAL;
 
 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
 	in = kvzalloc(inlen, GFP_KERNEL);
-	if (!in) {
-		err = -ENOMEM;
-		goto err;
-	}
+	if (!in)
+		return -ENOMEM;
 
 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
 
@@ -6248,26 +5171,24 @@
 
 	err = mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn);
 	kvfree(in);
-
 	if (err)
-		goto err;
+		return err;
 
 	rwq_ind_tbl->ib_rwq_ind_tbl.ind_tbl_num = rwq_ind_tbl->rqtn;
 	if (udata->outlen) {
-		resp.response_length = offsetof(typeof(resp), response_length) +
-					sizeof(resp.response_length);
+		resp.response_length =
+			offsetofend(struct mlx5_ib_create_rwq_ind_tbl_resp,
+				    response_length);
 		err = ib_copy_to_udata(udata, &resp, resp.response_length);
 		if (err)
 			goto err_copy;
 	}
 
-	return &rwq_ind_tbl->ib_rwq_ind_tbl;
+	return 0;
 
 err_copy:
 	mlx5_cmd_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn, rwq_ind_tbl->uid);
-err:
-	kfree(rwq_ind_tbl);
-	return ERR_PTR(err);
+	return err;
 }
 
 int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl)
@@ -6275,10 +5196,7 @@
 	struct mlx5_ib_rwq_ind_table *rwq_ind_tbl = to_mrwq_ind_table(ib_rwq_ind_tbl);
 	struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_tbl->device);
 
-	mlx5_cmd_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn, rwq_ind_tbl->uid);
-
-	kfree(rwq_ind_tbl);
-	return 0;
+	return mlx5_cmd_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn, rwq_ind_tbl->uid);
 }
 
 int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
@@ -6295,7 +5213,7 @@
 	void *rqc;
 	void *in;
 
-	required_cmd_sz = offsetof(typeof(ucmd), reserved) + sizeof(ucmd.reserved);
+	required_cmd_sz = offsetofend(struct mlx5_ib_modify_wq, reserved);
 	if (udata->inlen < required_cmd_sz)
 		return -EINVAL;
 
@@ -6363,7 +5281,7 @@
 				"Receive WQ counters are not supported on current FW\n");
 	}
 
-	err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in, inlen);
+	err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in);
 	if (!err)
 		rwq->ibwq.state = (wq_state == MLX5_RQC_STATE_ERR) ? IB_WQS_ERR : wq_state;
 
@@ -6462,7 +5380,7 @@
 	sdrain.cqe.done = mlx5_ib_drain_qp_done;
 	init_completion(&sdrain.done);
 
-	ret = _mlx5_ib_post_send(qp, &swr.wr, &bad_swr, true);
+	ret = mlx5_ib_post_send_drain(qp, &swr.wr, &bad_swr);
 	if (ret) {
 		WARN_ONCE(ret, "failed to drain send queue: %d\n", ret);
 		return;
@@ -6492,7 +5410,7 @@
 	rdrain.cqe.done = mlx5_ib_drain_qp_done;
 	init_completion(&rdrain.done);
 
-	ret = _mlx5_ib_post_recv(qp, &rwr, &bad_rwr, true);
+	ret = mlx5_ib_post_recv_drain(qp, &rwr, &bad_rwr);
 	if (ret) {
 		WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret);
 		return;
diff --git a/drivers/infiniband/hw/mlx5/qp.h b/drivers/infiniband/hw/mlx5/qp.h
new file mode 100644
index 0000000..5d4e140
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/qp.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#ifndef _MLX5_IB_QP_H
+#define _MLX5_IB_QP_H
+
+#include "mlx5_ib.h"
+
+int mlx5_init_qp_table(struct mlx5_ib_dev *dev);
+void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev);
+
+int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *qp,
+			 u32 *in, int inlen, u32 *out, int outlen);
+int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp,
+		       u32 *in, int inlen, u32 *out);
+int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask,
+			void *qpc, struct mlx5_core_qp *qp, u32 *ece);
+int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp);
+int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct);
+int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp,
+		       u32 *out, int outlen);
+int mlx5_core_dct_query(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct,
+			u32 *out, int outlen);
+
+int mlx5_core_set_delay_drop(struct mlx5_ib_dev *dev, u32 timeout_usec);
+
+int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev,
+				 struct mlx5_core_qp *rq);
+int mlx5_core_create_sq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen,
+				struct mlx5_core_qp *sq);
+void mlx5_core_destroy_sq_tracked(struct mlx5_ib_dev *dev,
+				  struct mlx5_core_qp *sq);
+
+int mlx5_core_create_rq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen,
+				struct mlx5_core_qp *rq);
+
+struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_ib_dev *dev,
+						int res_num,
+						enum mlx5_res_type res_type);
+void mlx5_core_res_put(struct mlx5_core_rsc_common *res);
+
+int mlx5_core_xrcd_alloc(struct mlx5_ib_dev *dev, u32 *xrcdn);
+int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn);
+int mlx5_ib_qp_set_counter(struct ib_qp *qp, struct rdma_counter *counter);
+#endif /* _MLX5_IB_QP_H */
diff --git a/drivers/infiniband/hw/mlx5/qpc.c b/drivers/infiniband/hw/mlx5/qpc.c
new file mode 100644
index 0000000..c683d70
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/qpc.c
@@ -0,0 +1,642 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2013-2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include <linux/gfp.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/driver.h>
+#include "mlx5_ib.h"
+#include "qp.h"
+
+static int mlx5_core_drain_dct(struct mlx5_ib_dev *dev,
+			       struct mlx5_core_dct *dct);
+
+static struct mlx5_core_rsc_common *
+mlx5_get_rsc(struct mlx5_qp_table *table, u32 rsn)
+{
+	struct mlx5_core_rsc_common *common;
+	unsigned long flags;
+
+	spin_lock_irqsave(&table->lock, flags);
+
+	common = radix_tree_lookup(&table->tree, rsn);
+	if (common)
+		refcount_inc(&common->refcount);
+
+	spin_unlock_irqrestore(&table->lock, flags);
+
+	return common;
+}
+
+void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common)
+{
+	if (refcount_dec_and_test(&common->refcount))
+		complete(&common->free);
+}
+
+static u64 qp_allowed_event_types(void)
+{
+	u64 mask;
+
+	mask = BIT(MLX5_EVENT_TYPE_PATH_MIG) |
+	       BIT(MLX5_EVENT_TYPE_COMM_EST) |
+	       BIT(MLX5_EVENT_TYPE_SQ_DRAINED) |
+	       BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) |
+	       BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR) |
+	       BIT(MLX5_EVENT_TYPE_PATH_MIG_FAILED) |
+	       BIT(MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) |
+	       BIT(MLX5_EVENT_TYPE_WQ_ACCESS_ERROR);
+
+	return mask;
+}
+
+static u64 rq_allowed_event_types(void)
+{
+	u64 mask;
+
+	mask = BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) |
+	       BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR);
+
+	return mask;
+}
+
+static u64 sq_allowed_event_types(void)
+{
+	return BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR);
+}
+
+static u64 dct_allowed_event_types(void)
+{
+	return BIT(MLX5_EVENT_TYPE_DCT_DRAINED);
+}
+
+static bool is_event_type_allowed(int rsc_type, int event_type)
+{
+	switch (rsc_type) {
+	case MLX5_EVENT_QUEUE_TYPE_QP:
+		return BIT(event_type) & qp_allowed_event_types();
+	case MLX5_EVENT_QUEUE_TYPE_RQ:
+		return BIT(event_type) & rq_allowed_event_types();
+	case MLX5_EVENT_QUEUE_TYPE_SQ:
+		return BIT(event_type) & sq_allowed_event_types();
+	case MLX5_EVENT_QUEUE_TYPE_DCT:
+		return BIT(event_type) & dct_allowed_event_types();
+	default:
+		WARN(1, "Event arrived for unknown resource type");
+		return false;
+	}
+}
+
+static int rsc_event_notifier(struct notifier_block *nb,
+			      unsigned long type, void *data)
+{
+	struct mlx5_core_rsc_common *common;
+	struct mlx5_qp_table *table;
+	struct mlx5_core_dct *dct;
+	u8 event_type = (u8)type;
+	struct mlx5_core_qp *qp;
+	struct mlx5_eqe *eqe;
+	u32 rsn;
+
+	switch (event_type) {
+	case MLX5_EVENT_TYPE_DCT_DRAINED:
+		eqe = data;
+		rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff;
+		rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN);
+		break;
+	case MLX5_EVENT_TYPE_PATH_MIG:
+	case MLX5_EVENT_TYPE_COMM_EST:
+	case MLX5_EVENT_TYPE_SQ_DRAINED:
+	case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+	case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+		eqe = data;
+		rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+		rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN);
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	table = container_of(nb, struct mlx5_qp_table, nb);
+	common = mlx5_get_rsc(table, rsn);
+	if (!common)
+		return NOTIFY_OK;
+
+	if (!is_event_type_allowed((rsn >> MLX5_USER_INDEX_LEN), event_type))
+		goto out;
+
+	switch (common->res) {
+	case MLX5_RES_QP:
+	case MLX5_RES_RQ:
+	case MLX5_RES_SQ:
+		qp = (struct mlx5_core_qp *)common;
+		qp->event(qp, event_type);
+		break;
+	case MLX5_RES_DCT:
+		dct = (struct mlx5_core_dct *)common;
+		if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED)
+			complete(&dct->drained);
+		break;
+	default:
+		break;
+	}
+out:
+	mlx5_core_put_rsc(common);
+
+	return NOTIFY_OK;
+}
+
+static int create_resource_common(struct mlx5_ib_dev *dev,
+				  struct mlx5_core_qp *qp, int rsc_type)
+{
+	struct mlx5_qp_table *table = &dev->qp_table;
+	int err;
+
+	qp->common.res = rsc_type;
+	spin_lock_irq(&table->lock);
+	err = radix_tree_insert(&table->tree,
+				qp->qpn | (rsc_type << MLX5_USER_INDEX_LEN),
+				qp);
+	spin_unlock_irq(&table->lock);
+	if (err)
+		return err;
+
+	refcount_set(&qp->common.refcount, 1);
+	init_completion(&qp->common.free);
+	qp->pid = current->pid;
+
+	return 0;
+}
+
+static void destroy_resource_common(struct mlx5_ib_dev *dev,
+				    struct mlx5_core_qp *qp)
+{
+	struct mlx5_qp_table *table = &dev->qp_table;
+	unsigned long flags;
+
+	spin_lock_irqsave(&table->lock, flags);
+	radix_tree_delete(&table->tree,
+			  qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN));
+	spin_unlock_irqrestore(&table->lock, flags);
+	mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp);
+	wait_for_completion(&qp->common.free);
+}
+
+static int _mlx5_core_destroy_dct(struct mlx5_ib_dev *dev,
+				  struct mlx5_core_dct *dct, bool need_cleanup)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {};
+	struct mlx5_core_qp *qp = &dct->mqp;
+	int err;
+
+	err = mlx5_core_drain_dct(dev, dct);
+	if (err) {
+		if (dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
+			goto destroy;
+
+		return err;
+	}
+	wait_for_completion(&dct->drained);
+destroy:
+	if (need_cleanup)
+		destroy_resource_common(dev, &dct->mqp);
+	MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT);
+	MLX5_SET(destroy_dct_in, in, dctn, qp->qpn);
+	MLX5_SET(destroy_dct_in, in, uid, qp->uid);
+	err = mlx5_cmd_exec_in(dev->mdev, destroy_dct, in);
+	return err;
+}
+
+int mlx5_core_create_dct(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct,
+			 u32 *in, int inlen, u32 *out, int outlen)
+{
+	struct mlx5_core_qp *qp = &dct->mqp;
+	int err;
+
+	init_completion(&dct->drained);
+	MLX5_SET(create_dct_in, in, opcode, MLX5_CMD_OP_CREATE_DCT);
+
+	err = mlx5_cmd_exec(dev->mdev, in, inlen, out, outlen);
+	if (err)
+		return err;
+
+	qp->qpn = MLX5_GET(create_dct_out, out, dctn);
+	qp->uid = MLX5_GET(create_dct_in, in, uid);
+	err = create_resource_common(dev, qp, MLX5_RES_DCT);
+	if (err)
+		goto err_cmd;
+
+	return 0;
+err_cmd:
+	_mlx5_core_destroy_dct(dev, dct, false);
+	return err;
+}
+
+int mlx5_qpc_create_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp,
+		       u32 *in, int inlen, u32 *out)
+{
+	u32 din[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
+	int err;
+
+	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
+
+	err = mlx5_cmd_exec(dev->mdev, in, inlen, out,
+			    MLX5_ST_SZ_BYTES(create_qp_out));
+	if (err)
+		return err;
+
+	qp->uid = MLX5_GET(create_qp_in, in, uid);
+	qp->qpn = MLX5_GET(create_qp_out, out, qpn);
+
+	err = create_resource_common(dev, qp, MLX5_RES_QP);
+	if (err)
+		goto err_cmd;
+
+	mlx5_debug_qp_add(dev->mdev, qp);
+
+	return 0;
+
+err_cmd:
+	MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP);
+	MLX5_SET(destroy_qp_in, din, qpn, qp->qpn);
+	MLX5_SET(destroy_qp_in, din, uid, qp->uid);
+	mlx5_cmd_exec_in(dev->mdev, destroy_qp, din);
+	return err;
+}
+
+static int mlx5_core_drain_dct(struct mlx5_ib_dev *dev,
+			       struct mlx5_core_dct *dct)
+{
+	u32 in[MLX5_ST_SZ_DW(drain_dct_in)] = {};
+	struct mlx5_core_qp *qp = &dct->mqp;
+
+	MLX5_SET(drain_dct_in, in, opcode, MLX5_CMD_OP_DRAIN_DCT);
+	MLX5_SET(drain_dct_in, in, dctn, qp->qpn);
+	MLX5_SET(drain_dct_in, in, uid, qp->uid);
+	return mlx5_cmd_exec_in(dev->mdev, drain_dct, in);
+}
+
+int mlx5_core_destroy_dct(struct mlx5_ib_dev *dev,
+			  struct mlx5_core_dct *dct)
+{
+	return _mlx5_core_destroy_dct(dev, dct, true);
+}
+
+int mlx5_core_destroy_qp(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
+
+	mlx5_debug_qp_remove(dev->mdev, qp);
+
+	destroy_resource_common(dev, qp);
+
+	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
+	MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
+	MLX5_SET(destroy_qp_in, in, uid, qp->uid);
+	mlx5_cmd_exec_in(dev->mdev, destroy_qp, in);
+	return 0;
+}
+
+int mlx5_core_set_delay_drop(struct mlx5_ib_dev *dev,
+			     u32 timeout_usec)
+{
+	u32 in[MLX5_ST_SZ_DW(set_delay_drop_params_in)] = {};
+
+	MLX5_SET(set_delay_drop_params_in, in, opcode,
+		 MLX5_CMD_OP_SET_DELAY_DROP_PARAMS);
+	MLX5_SET(set_delay_drop_params_in, in, delay_drop_timeout,
+		 timeout_usec / 100);
+	return mlx5_cmd_exec_in(dev->mdev, set_delay_drop_params, in);
+}
+
+struct mbox_info {
+	u32 *in;
+	u32 *out;
+	int inlen;
+	int outlen;
+};
+
+static int mbox_alloc(struct mbox_info *mbox, int inlen, int outlen)
+{
+	mbox->inlen  = inlen;
+	mbox->outlen = outlen;
+	mbox->in = kzalloc(mbox->inlen, GFP_KERNEL);
+	mbox->out = kzalloc(mbox->outlen, GFP_KERNEL);
+	if (!mbox->in || !mbox->out) {
+		kfree(mbox->in);
+		kfree(mbox->out);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void mbox_free(struct mbox_info *mbox)
+{
+	kfree(mbox->in);
+	kfree(mbox->out);
+}
+
+static int get_ece_from_mbox(void *out, u16 opcode)
+{
+	int ece = 0;
+
+	switch (opcode) {
+	case MLX5_CMD_OP_INIT2INIT_QP:
+		ece = MLX5_GET(init2init_qp_out, out, ece);
+		break;
+	case MLX5_CMD_OP_INIT2RTR_QP:
+		ece = MLX5_GET(init2rtr_qp_out, out, ece);
+		break;
+	case MLX5_CMD_OP_RTR2RTS_QP:
+		ece = MLX5_GET(rtr2rts_qp_out, out, ece);
+		break;
+	case MLX5_CMD_OP_RTS2RTS_QP:
+		ece = MLX5_GET(rts2rts_qp_out, out, ece);
+		break;
+	case MLX5_CMD_OP_RST2INIT_QP:
+		ece = MLX5_GET(rst2init_qp_out, out, ece);
+		break;
+	default:
+		break;
+	}
+
+	return ece;
+}
+
+static int modify_qp_mbox_alloc(struct mlx5_core_dev *dev, u16 opcode, int qpn,
+				u32 opt_param_mask, void *qpc,
+				struct mbox_info *mbox, u16 uid, u32 ece)
+{
+	mbox->out = NULL;
+	mbox->in = NULL;
+
+#define MBOX_ALLOC(mbox, typ)  \
+	mbox_alloc(mbox, MLX5_ST_SZ_BYTES(typ##_in), MLX5_ST_SZ_BYTES(typ##_out))
+
+#define MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid)                            \
+	do {                                                                   \
+		MLX5_SET(typ##_in, in, opcode, _opcode);                       \
+		MLX5_SET(typ##_in, in, qpn, _qpn);                             \
+		MLX5_SET(typ##_in, in, uid, _uid);                             \
+	} while (0)
+
+#define MOD_QP_IN_SET_QPC(typ, in, _opcode, _qpn, _opt_p, _qpc, _uid)          \
+	do {                                                                   \
+		MOD_QP_IN_SET(typ, in, _opcode, _qpn, _uid);                   \
+		MLX5_SET(typ##_in, in, opt_param_mask, _opt_p);                \
+		memcpy(MLX5_ADDR_OF(typ##_in, in, qpc), _qpc,                  \
+		       MLX5_ST_SZ_BYTES(qpc));                                 \
+	} while (0)
+
+	switch (opcode) {
+	/* 2RST & 2ERR */
+	case MLX5_CMD_OP_2RST_QP:
+		if (MBOX_ALLOC(mbox, qp_2rst))
+			return -ENOMEM;
+		MOD_QP_IN_SET(qp_2rst, mbox->in, opcode, qpn, uid);
+		break;
+	case MLX5_CMD_OP_2ERR_QP:
+		if (MBOX_ALLOC(mbox, qp_2err))
+			return -ENOMEM;
+		MOD_QP_IN_SET(qp_2err, mbox->in, opcode, qpn, uid);
+		break;
+
+	/* MODIFY with QPC */
+	case MLX5_CMD_OP_RST2INIT_QP:
+		if (MBOX_ALLOC(mbox, rst2init_qp))
+			return -ENOMEM;
+		MOD_QP_IN_SET_QPC(rst2init_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc, uid);
+		MLX5_SET(rst2init_qp_in, mbox->in, ece, ece);
+		break;
+	case MLX5_CMD_OP_INIT2RTR_QP:
+		if (MBOX_ALLOC(mbox, init2rtr_qp))
+			return -ENOMEM;
+		MOD_QP_IN_SET_QPC(init2rtr_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc, uid);
+		MLX5_SET(init2rtr_qp_in, mbox->in, ece, ece);
+		break;
+	case MLX5_CMD_OP_RTR2RTS_QP:
+		if (MBOX_ALLOC(mbox, rtr2rts_qp))
+			return -ENOMEM;
+		MOD_QP_IN_SET_QPC(rtr2rts_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc, uid);
+		MLX5_SET(rtr2rts_qp_in, mbox->in, ece, ece);
+		break;
+	case MLX5_CMD_OP_RTS2RTS_QP:
+		if (MBOX_ALLOC(mbox, rts2rts_qp))
+			return -ENOMEM;
+		MOD_QP_IN_SET_QPC(rts2rts_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc, uid);
+		MLX5_SET(rts2rts_qp_in, mbox->in, ece, ece);
+		break;
+	case MLX5_CMD_OP_SQERR2RTS_QP:
+		if (MBOX_ALLOC(mbox, sqerr2rts_qp))
+			return -ENOMEM;
+		MOD_QP_IN_SET_QPC(sqerr2rts_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc, uid);
+		break;
+	case MLX5_CMD_OP_INIT2INIT_QP:
+		if (MBOX_ALLOC(mbox, init2init_qp))
+			return -ENOMEM;
+		MOD_QP_IN_SET_QPC(init2init_qp, mbox->in, opcode, qpn,
+				  opt_param_mask, qpc, uid);
+		MLX5_SET(init2init_qp_in, mbox->in, ece, ece);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+int mlx5_core_qp_modify(struct mlx5_ib_dev *dev, u16 opcode, u32 opt_param_mask,
+			void *qpc, struct mlx5_core_qp *qp, u32 *ece)
+{
+	struct mbox_info mbox;
+	int err;
+
+	err = modify_qp_mbox_alloc(dev->mdev, opcode, qp->qpn, opt_param_mask,
+				   qpc, &mbox, qp->uid, (ece) ? *ece : 0);
+	if (err)
+		return err;
+
+	err = mlx5_cmd_exec(dev->mdev, mbox.in, mbox.inlen, mbox.out,
+			    mbox.outlen);
+
+	if (ece)
+		*ece = get_ece_from_mbox(mbox.out, opcode);
+
+	mbox_free(&mbox);
+	return err;
+}
+
+int mlx5_init_qp_table(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_qp_table *table = &dev->qp_table;
+
+	spin_lock_init(&table->lock);
+	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+	mlx5_qp_debugfs_init(dev->mdev);
+
+	table->nb.notifier_call = rsc_event_notifier;
+	mlx5_notifier_register(dev->mdev, &table->nb);
+
+	return 0;
+}
+
+void mlx5_cleanup_qp_table(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_qp_table *table = &dev->qp_table;
+
+	mlx5_notifier_unregister(dev->mdev, &table->nb);
+	mlx5_qp_debugfs_cleanup(dev->mdev);
+}
+
+int mlx5_core_qp_query(struct mlx5_ib_dev *dev, struct mlx5_core_qp *qp,
+		       u32 *out, int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(query_qp_in)] = {};
+
+	MLX5_SET(query_qp_in, in, opcode, MLX5_CMD_OP_QUERY_QP);
+	MLX5_SET(query_qp_in, in, qpn, qp->qpn);
+	return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, outlen);
+}
+
+int mlx5_core_dct_query(struct mlx5_ib_dev *dev, struct mlx5_core_dct *dct,
+			u32 *out, int outlen)
+{
+	u32 in[MLX5_ST_SZ_DW(query_dct_in)] = {};
+	struct mlx5_core_qp *qp = &dct->mqp;
+
+	MLX5_SET(query_dct_in, in, opcode, MLX5_CMD_OP_QUERY_DCT);
+	MLX5_SET(query_dct_in, in, dctn, qp->qpn);
+
+	return mlx5_cmd_exec(dev->mdev, (void *)&in, sizeof(in), (void *)out,
+			     outlen);
+}
+
+int mlx5_core_xrcd_alloc(struct mlx5_ib_dev *dev, u32 *xrcdn)
+{
+	u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)] = {};
+	int err;
+
+	MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD);
+	err = mlx5_cmd_exec_inout(dev->mdev, alloc_xrcd, in, out);
+	if (!err)
+		*xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd);
+	return err;
+}
+
+int mlx5_core_xrcd_dealloc(struct mlx5_ib_dev *dev, u32 xrcdn)
+{
+	u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {};
+
+	MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD);
+	MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn);
+	return mlx5_cmd_exec_in(dev->mdev, dealloc_xrcd, in);
+}
+
+static void destroy_rq_tracked(struct mlx5_ib_dev *dev, u32 rqn, u16 uid)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_rq_in)] = {};
+
+	MLX5_SET(destroy_rq_in, in, opcode, MLX5_CMD_OP_DESTROY_RQ);
+	MLX5_SET(destroy_rq_in, in, rqn, rqn);
+	MLX5_SET(destroy_rq_in, in, uid, uid);
+	mlx5_cmd_exec_in(dev->mdev, destroy_rq, in);
+}
+
+int mlx5_core_create_rq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen,
+				struct mlx5_core_qp *rq)
+{
+	int err;
+	u32 rqn;
+
+	err = mlx5_core_create_rq(dev->mdev, in, inlen, &rqn);
+	if (err)
+		return err;
+
+	rq->uid = MLX5_GET(create_rq_in, in, uid);
+	rq->qpn = rqn;
+	err = create_resource_common(dev, rq, MLX5_RES_RQ);
+	if (err)
+		goto err_destroy_rq;
+
+	return 0;
+
+err_destroy_rq:
+	destroy_rq_tracked(dev, rq->qpn, rq->uid);
+
+	return err;
+}
+
+int mlx5_core_destroy_rq_tracked(struct mlx5_ib_dev *dev,
+				 struct mlx5_core_qp *rq)
+{
+	destroy_resource_common(dev, rq);
+	destroy_rq_tracked(dev, rq->qpn, rq->uid);
+	return 0;
+}
+
+static void destroy_sq_tracked(struct mlx5_ib_dev *dev, u32 sqn, u16 uid)
+{
+	u32 in[MLX5_ST_SZ_DW(destroy_sq_in)] = {};
+
+	MLX5_SET(destroy_sq_in, in, opcode, MLX5_CMD_OP_DESTROY_SQ);
+	MLX5_SET(destroy_sq_in, in, sqn, sqn);
+	MLX5_SET(destroy_sq_in, in, uid, uid);
+	mlx5_cmd_exec_in(dev->mdev, destroy_sq, in);
+}
+
+int mlx5_core_create_sq_tracked(struct mlx5_ib_dev *dev, u32 *in, int inlen,
+				struct mlx5_core_qp *sq)
+{
+	u32 out[MLX5_ST_SZ_DW(create_sq_out)] = {};
+	int err;
+
+	MLX5_SET(create_sq_in, in, opcode, MLX5_CMD_OP_CREATE_SQ);
+	err = mlx5_cmd_exec(dev->mdev, in, inlen, out, sizeof(out));
+	if (err)
+		return err;
+
+	sq->qpn = MLX5_GET(create_sq_out, out, sqn);
+	sq->uid = MLX5_GET(create_sq_in, in, uid);
+	err = create_resource_common(dev, sq, MLX5_RES_SQ);
+	if (err)
+		goto err_destroy_sq;
+
+	return 0;
+
+err_destroy_sq:
+	destroy_sq_tracked(dev, sq->qpn, sq->uid);
+
+	return err;
+}
+
+void mlx5_core_destroy_sq_tracked(struct mlx5_ib_dev *dev,
+				  struct mlx5_core_qp *sq)
+{
+	destroy_resource_common(dev, sq);
+	destroy_sq_tracked(dev, sq->qpn, sq->uid);
+}
+
+struct mlx5_core_rsc_common *mlx5_core_res_hold(struct mlx5_ib_dev *dev,
+						int res_num,
+						enum mlx5_res_type res_type)
+{
+	u32 rsn = res_num | (res_type << MLX5_USER_INDEX_LEN);
+	struct mlx5_qp_table *table = &dev->qp_table;
+
+	return mlx5_get_rsc(table, rsn);
+}
+
+void mlx5_core_res_put(struct mlx5_core_rsc_common *res)
+{
+	mlx5_core_put_rsc(res);
+}
diff --git a/drivers/infiniband/hw/mlx5/restrack.c b/drivers/infiniband/hw/mlx5/restrack.c
new file mode 100644
index 0000000..887270d
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/restrack.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019-2020, Mellanox Technologies Ltd. All rights reserved.
+ */
+
+#include <uapi/rdma/rdma_netlink.h>
+#include <linux/mlx5/rsc_dump.h>
+#include <rdma/ib_umem_odp.h>
+#include <rdma/restrack.h>
+#include "mlx5_ib.h"
+#include "restrack.h"
+
+#define MAX_DUMP_SIZE 1024
+
+static int dump_rsc(struct mlx5_core_dev *dev, enum mlx5_sgmt_type type,
+		    int index, void *data, int *data_len)
+{
+	struct mlx5_core_dev *mdev = dev;
+	struct mlx5_rsc_dump_cmd *cmd;
+	struct mlx5_rsc_key key = {};
+	struct page *page;
+	int offset = 0;
+	int err = 0;
+	int cmd_err;
+	int size;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	key.size = PAGE_SIZE;
+	key.rsc = type;
+	key.index1 = index;
+	key.num_of_obj1 = 1;
+
+	cmd = mlx5_rsc_dump_cmd_create(mdev, &key);
+	if (IS_ERR(cmd)) {
+		err = PTR_ERR(cmd);
+		goto free_page;
+	}
+
+	do {
+		cmd_err = mlx5_rsc_dump_next(mdev, cmd, page, &size);
+		if (cmd_err < 0 || size + offset > MAX_DUMP_SIZE) {
+			err = cmd_err;
+			goto destroy_cmd;
+		}
+		memcpy(data + offset, page_address(page), size);
+		offset += size;
+	} while (cmd_err > 0);
+	*data_len = offset;
+
+destroy_cmd:
+	mlx5_rsc_dump_cmd_destroy(cmd);
+free_page:
+	__free_page(page);
+	return err;
+}
+
+static int fill_res_raw(struct sk_buff *msg, struct mlx5_ib_dev *dev,
+			enum mlx5_sgmt_type type, u32 key)
+{
+	int len = 0;
+	void *data;
+	int err;
+
+	data = kzalloc(MAX_DUMP_SIZE, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	err = dump_rsc(dev->mdev, type, key, data, &len);
+	if (err)
+		goto out;
+
+	err = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, len, data);
+out:
+	kfree(data);
+	return err;
+}
+
+static int fill_stat_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr)
+{
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+	struct nlattr *table_attr;
+
+	if (!(mr->access_flags & IB_ACCESS_ON_DEMAND))
+		return 0;
+
+	table_attr = nla_nest_start(msg,
+				    RDMA_NLDEV_ATTR_STAT_HWCOUNTERS);
+
+	if (!table_attr)
+		goto err;
+
+	if (rdma_nl_stat_hwcounter_entry(msg, "page_faults",
+					 atomic64_read(&mr->odp_stats.faults)))
+		goto err_table;
+	if (rdma_nl_stat_hwcounter_entry(
+		    msg, "page_invalidations",
+		    atomic64_read(&mr->odp_stats.invalidations)))
+		goto err_table;
+	if (rdma_nl_stat_hwcounter_entry(msg, "page_prefetch",
+					 atomic64_read(&mr->odp_stats.prefetch)))
+		goto err_table;
+
+	nla_nest_end(msg, table_attr);
+	return 0;
+
+err_table:
+	nla_nest_cancel(msg, table_attr);
+err:
+	return -EMSGSIZE;
+}
+
+static int fill_res_mr_entry_raw(struct sk_buff *msg, struct ib_mr *ibmr)
+{
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+
+	return fill_res_raw(msg, mr->dev, MLX5_SGMT_TYPE_PRM_QUERY_MKEY,
+			    mlx5_mkey_to_idx(mr->mmkey.key));
+}
+
+static int fill_res_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr)
+{
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+	struct nlattr *table_attr;
+
+	if (!(mr->access_flags & IB_ACCESS_ON_DEMAND))
+		return 0;
+
+	table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER);
+	if (!table_attr)
+		goto err;
+
+	if (mr->is_odp_implicit) {
+		if (rdma_nl_put_driver_string(msg, "odp", "implicit"))
+			goto err;
+	} else {
+		if (rdma_nl_put_driver_string(msg, "odp", "explicit"))
+			goto err;
+	}
+
+	nla_nest_end(msg, table_attr);
+	return 0;
+
+err:
+	nla_nest_cancel(msg, table_attr);
+	return -EMSGSIZE;
+}
+
+static int fill_res_cq_entry_raw(struct sk_buff *msg, struct ib_cq *ibcq)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibcq->device);
+	struct mlx5_ib_cq *cq = to_mcq(ibcq);
+
+	return fill_res_raw(msg, dev, MLX5_SGMT_TYPE_PRM_QUERY_CQ, cq->mcq.cqn);
+}
+
+static int fill_res_qp_entry_raw(struct sk_buff *msg, struct ib_qp *ibqp)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+
+	return fill_res_raw(msg, dev, MLX5_SGMT_TYPE_PRM_QUERY_QP,
+			    ibqp->qp_num);
+}
+
+static const struct ib_device_ops restrack_ops = {
+	.fill_res_cq_entry_raw = fill_res_cq_entry_raw,
+	.fill_res_mr_entry = fill_res_mr_entry,
+	.fill_res_mr_entry_raw = fill_res_mr_entry_raw,
+	.fill_res_qp_entry_raw = fill_res_qp_entry_raw,
+	.fill_stat_mr_entry = fill_stat_mr_entry,
+};
+
+int mlx5_ib_restrack_init(struct mlx5_ib_dev *dev)
+{
+	ib_set_device_ops(&dev->ib_dev, &restrack_ops);
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/restrack.h b/drivers/infiniband/hw/mlx5/restrack.h
new file mode 100644
index 0000000..e8d8127
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/restrack.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2013-2020, Mellanox Technologies Ltd. All rights reserved.
+ */
+
+#ifndef _MLX5_IB_RESTRACK_H
+#define _MLX5_IB_RESTRACK_H
+
+#include "mlx5_ib.h"
+
+int mlx5_ib_restrack_init(struct mlx5_ib_dev *dev);
+
+#endif /* _MLX5_IB_RESTRACK_H */
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
index c29c1f7..e2f720e 100644
--- a/drivers/infiniband/hw/mlx5/srq.c
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -80,7 +80,7 @@
 
 	srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
 
-	srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0, 0);
+	srq->umem = ib_umem_get(pd->device, ucmd.buf_addr, buf_size, 0);
 	if (IS_ERR(srq->umem)) {
 		mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size);
 		err = PTR_ERR(srq->umem);
@@ -274,10 +274,10 @@
 	if (srq->wq_sig)
 		in.flags |= MLX5_SRQ_FLAG_WQ_SIG;
 
-	if (init_attr->srq_type == IB_SRQT_XRC)
+	if (init_attr->srq_type == IB_SRQT_XRC && init_attr->ext.xrc.xrcd)
 		in.xrcd = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn;
 	else
-		in.xrcd = to_mxrcd(dev->devr.x0)->xrcdn;
+		in.xrcd = dev->devr.xrcdn0;
 
 	if (init_attr->srq_type == IB_SRQT_TM) {
 		in.tm_log_list_size =
@@ -389,24 +389,21 @@
 	return ret;
 }
 
-void mlx5_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
+int mlx5_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
 {
 	struct mlx5_ib_dev *dev = to_mdev(srq->device);
 	struct mlx5_ib_srq *msrq = to_msrq(srq);
+	int ret;
 
-	mlx5_cmd_destroy_srq(dev, &msrq->msrq);
+	ret = mlx5_cmd_destroy_srq(dev, &msrq->msrq);
+	if (ret)
+		return ret;
 
-	if (srq->uobject) {
-		mlx5_ib_db_unmap_user(
-			rdma_udata_to_drv_context(
-				udata,
-				struct mlx5_ib_ucontext,
-				ibucontext),
-			&msrq->db);
-		ib_umem_release(msrq->umem);
-	} else {
+	if (udata)
+		destroy_srq_user(srq->pd, msrq, udata);
+	else
 		destroy_srq_kernel(dev, msrq);
-	}
+	return 0;
 }
 
 void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index)
diff --git a/drivers/infiniband/hw/mlx5/srq.h b/drivers/infiniband/hw/mlx5/srq.h
index af197c3..2c3627b 100644
--- a/drivers/infiniband/hw/mlx5/srq.h
+++ b/drivers/infiniband/hw/mlx5/srq.h
@@ -56,7 +56,7 @@
 
 int mlx5_cmd_create_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
 			struct mlx5_srq_attr *in);
-void mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq);
+int mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq);
 int mlx5_cmd_query_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
 		       struct mlx5_srq_attr *out);
 int mlx5_cmd_arm_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
diff --git a/drivers/infiniband/hw/mlx5/srq_cmd.c b/drivers/infiniband/hw/mlx5/srq_cmd.c
index 0224231..db889ec 100644
--- a/drivers/infiniband/hw/mlx5/srq_cmd.c
+++ b/drivers/infiniband/hw/mlx5/srq_cmd.c
@@ -5,9 +5,9 @@
 
 #include <linux/kernel.h>
 #include <linux/mlx5/driver.h>
-#include <linux/mlx5/cmd.h>
 #include "mlx5_ib.h"
 #include "srq.h"
+#include "qp.h"
 
 static int get_pas_size(struct mlx5_srq_attr *in)
 {
@@ -132,38 +132,33 @@
 
 static int destroy_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
 {
-	u32 srq_in[MLX5_ST_SZ_DW(destroy_srq_in)] = {0};
-	u32 srq_out[MLX5_ST_SZ_DW(destroy_srq_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(destroy_srq_in)] = {};
 
-	MLX5_SET(destroy_srq_in, srq_in, opcode,
-		 MLX5_CMD_OP_DESTROY_SRQ);
-	MLX5_SET(destroy_srq_in, srq_in, srqn, srq->srqn);
-	MLX5_SET(destroy_srq_in, srq_in, uid, srq->uid);
+	MLX5_SET(destroy_srq_in, in, opcode, MLX5_CMD_OP_DESTROY_SRQ);
+	MLX5_SET(destroy_srq_in, in, srqn, srq->srqn);
+	MLX5_SET(destroy_srq_in, in, uid, srq->uid);
 
-	return mlx5_cmd_exec(dev->mdev, srq_in, sizeof(srq_in), srq_out,
-			     sizeof(srq_out));
+	return mlx5_cmd_exec_in(dev->mdev, destroy_srq, in);
 }
 
 static int arm_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
 		       u16 lwm, int is_srq)
 {
-	u32 srq_in[MLX5_ST_SZ_DW(arm_rq_in)] = {0};
-	u32 srq_out[MLX5_ST_SZ_DW(arm_rq_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {};
 
-	MLX5_SET(arm_rq_in, srq_in, opcode, MLX5_CMD_OP_ARM_RQ);
-	MLX5_SET(arm_rq_in, srq_in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_SRQ);
-	MLX5_SET(arm_rq_in, srq_in, srq_number, srq->srqn);
-	MLX5_SET(arm_rq_in, srq_in, lwm,      lwm);
-	MLX5_SET(arm_rq_in, srq_in, uid, srq->uid);
+	MLX5_SET(arm_rq_in, in, opcode, MLX5_CMD_OP_ARM_RQ);
+	MLX5_SET(arm_rq_in, in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_SRQ);
+	MLX5_SET(arm_rq_in, in, srq_number, srq->srqn);
+	MLX5_SET(arm_rq_in, in, lwm, lwm);
+	MLX5_SET(arm_rq_in, in, uid, srq->uid);
 
-	return mlx5_cmd_exec(dev->mdev, srq_in, sizeof(srq_in), srq_out,
-			     sizeof(srq_out));
+	return mlx5_cmd_exec_in(dev->mdev, arm_rq, in);
 }
 
 static int query_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
 			 struct mlx5_srq_attr *out)
 {
-	u32 srq_in[MLX5_ST_SZ_DW(query_srq_in)] = {0};
+	u32 in[MLX5_ST_SZ_DW(query_srq_in)] = {};
 	u32 *srq_out;
 	void *srqc;
 	int err;
@@ -172,11 +167,9 @@
 	if (!srq_out)
 		return -ENOMEM;
 
-	MLX5_SET(query_srq_in, srq_in, opcode,
-		 MLX5_CMD_OP_QUERY_SRQ);
-	MLX5_SET(query_srq_in, srq_in, srqn, srq->srqn);
-	err = mlx5_cmd_exec(dev->mdev, srq_in, sizeof(srq_in), srq_out,
-			    MLX5_ST_SZ_BYTES(query_srq_out));
+	MLX5_SET(query_srq_in, in, opcode, MLX5_CMD_OP_QUERY_SRQ);
+	MLX5_SET(query_srq_in, in, srqn, srq->srqn);
+	err = mlx5_cmd_exec_inout(dev->mdev, query_srq, in, srq_out);
 	if (err)
 		goto out;
 
@@ -234,39 +227,35 @@
 static int destroy_xrc_srq_cmd(struct mlx5_ib_dev *dev,
 			       struct mlx5_core_srq *srq)
 {
-	u32 xrcsrq_in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)]   = {0};
-	u32 xrcsrq_out[MLX5_ST_SZ_DW(destroy_xrc_srq_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(destroy_xrc_srq_in)] = {};
 
-	MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, opcode,
-		 MLX5_CMD_OP_DESTROY_XRC_SRQ);
-	MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
-	MLX5_SET(destroy_xrc_srq_in, xrcsrq_in, uid, srq->uid);
+	MLX5_SET(destroy_xrc_srq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRC_SRQ);
+	MLX5_SET(destroy_xrc_srq_in, in, xrc_srqn, srq->srqn);
+	MLX5_SET(destroy_xrc_srq_in, in, uid, srq->uid);
 
-	return mlx5_cmd_exec(dev->mdev, xrcsrq_in, sizeof(xrcsrq_in),
-			     xrcsrq_out, sizeof(xrcsrq_out));
+	return mlx5_cmd_exec_in(dev->mdev, destroy_xrc_srq, in);
 }
 
 static int arm_xrc_srq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
 			   u16 lwm)
 {
-	u32 xrcsrq_in[MLX5_ST_SZ_DW(arm_xrc_srq_in)]   = {0};
-	u32 xrcsrq_out[MLX5_ST_SZ_DW(arm_xrc_srq_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(arm_xrc_srq_in)] = {};
 
-	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, opcode,   MLX5_CMD_OP_ARM_XRC_SRQ);
-	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, op_mod,   MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ);
-	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
-	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, lwm,      lwm);
-	MLX5_SET(arm_xrc_srq_in, xrcsrq_in, uid, srq->uid);
+	MLX5_SET(arm_xrc_srq_in, in, opcode, MLX5_CMD_OP_ARM_XRC_SRQ);
+	MLX5_SET(arm_xrc_srq_in, in, op_mod,
+		 MLX5_ARM_XRC_SRQ_IN_OP_MOD_XRC_SRQ);
+	MLX5_SET(arm_xrc_srq_in, in, xrc_srqn, srq->srqn);
+	MLX5_SET(arm_xrc_srq_in, in, lwm, lwm);
+	MLX5_SET(arm_xrc_srq_in, in, uid, srq->uid);
 
-	return  mlx5_cmd_exec(dev->mdev, xrcsrq_in, sizeof(xrcsrq_in),
-			      xrcsrq_out, sizeof(xrcsrq_out));
+	return  mlx5_cmd_exec_in(dev->mdev, arm_xrc_srq, in);
 }
 
 static int query_xrc_srq_cmd(struct mlx5_ib_dev *dev,
 			     struct mlx5_core_srq *srq,
 			     struct mlx5_srq_attr *out)
 {
-	u32 xrcsrq_in[MLX5_ST_SZ_DW(query_xrc_srq_in)];
+	u32 in[MLX5_ST_SZ_DW(query_xrc_srq_in)] = {};
 	u32 *xrcsrq_out;
 	void *xrc_srqc;
 	int err;
@@ -274,14 +263,11 @@
 	xrcsrq_out = kvzalloc(MLX5_ST_SZ_BYTES(query_xrc_srq_out), GFP_KERNEL);
 	if (!xrcsrq_out)
 		return -ENOMEM;
-	memset(xrcsrq_in, 0, sizeof(xrcsrq_in));
 
-	MLX5_SET(query_xrc_srq_in, xrcsrq_in, opcode,
-		 MLX5_CMD_OP_QUERY_XRC_SRQ);
-	MLX5_SET(query_xrc_srq_in, xrcsrq_in, xrc_srqn, srq->srqn);
+	MLX5_SET(query_xrc_srq_in, in, opcode, MLX5_CMD_OP_QUERY_XRC_SRQ);
+	MLX5_SET(query_xrc_srq_in, in, xrc_srqn, srq->srqn);
 
-	err = mlx5_cmd_exec(dev->mdev, xrcsrq_in, sizeof(xrcsrq_in),
-			    xrcsrq_out, MLX5_ST_SZ_BYTES(query_xrc_srq_out));
+	err = mlx5_cmd_exec_inout(dev->mdev, query_xrc_srq, in, xrcsrq_out);
 	if (err)
 		goto out;
 
@@ -341,13 +327,12 @@
 
 static int destroy_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
 {
-	u32 in[MLX5_ST_SZ_DW(destroy_rmp_in)]   = {};
-	u32 out[MLX5_ST_SZ_DW(destroy_rmp_out)] = {};
+	u32 in[MLX5_ST_SZ_DW(destroy_rmp_in)] = {};
 
 	MLX5_SET(destroy_rmp_in, in, opcode, MLX5_CMD_OP_DESTROY_RMP);
 	MLX5_SET(destroy_rmp_in, in, rmpn, srq->srqn);
 	MLX5_SET(destroy_rmp_in, in, uid, srq->uid);
-	return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+	return mlx5_cmd_exec_in(dev->mdev, destroy_rmp, in);
 }
 
 static int arm_rmp_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
@@ -384,7 +369,7 @@
 	MLX5_SET(rmpc, rmpc, state, MLX5_RMPC_STATE_RDY);
 	MLX5_SET(modify_rmp_in, in, opcode, MLX5_CMD_OP_MODIFY_RMP);
 
-	err = mlx5_cmd_exec(dev->mdev, in, inlen, out, outlen);
+	err = mlx5_cmd_exec_inout(dev->mdev, modify_rmp, in, out);
 
 out:
 	kvfree(in);
@@ -414,7 +399,7 @@
 
 	MLX5_SET(query_rmp_in, rmp_in, opcode, MLX5_CMD_OP_QUERY_RMP);
 	MLX5_SET(query_rmp_in, rmp_in, rmpn,   srq->srqn);
-	err = mlx5_cmd_exec(dev->mdev, rmp_in, inlen, rmp_out, outlen);
+	err = mlx5_cmd_exec_inout(dev->mdev, query_rmp, rmp_in, rmp_out);
 	if (err)
 		goto out;
 
@@ -477,36 +462,34 @@
 
 static int destroy_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
 {
-	u32 in[MLX5_ST_SZ_DW(destroy_xrq_in)] = {0};
-	u32 out[MLX5_ST_SZ_DW(destroy_xrq_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(destroy_xrq_in)] = {};
 
 	MLX5_SET(destroy_xrq_in, in, opcode, MLX5_CMD_OP_DESTROY_XRQ);
-	MLX5_SET(destroy_xrq_in, in, xrqn,   srq->srqn);
+	MLX5_SET(destroy_xrq_in, in, xrqn, srq->srqn);
 	MLX5_SET(destroy_xrq_in, in, uid, srq->uid);
 
-	return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+	return mlx5_cmd_exec_in(dev->mdev, destroy_xrq, in);
 }
 
 static int arm_xrq_cmd(struct mlx5_ib_dev *dev,
 		       struct mlx5_core_srq *srq,
 		       u16 lwm)
 {
-	u32 out[MLX5_ST_SZ_DW(arm_rq_out)] = {0};
-	u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {0};
+	u32 in[MLX5_ST_SZ_DW(arm_rq_in)] = {};
 
-	MLX5_SET(arm_rq_in, in, opcode,     MLX5_CMD_OP_ARM_RQ);
-	MLX5_SET(arm_rq_in, in, op_mod,     MLX5_ARM_RQ_IN_OP_MOD_XRQ);
+	MLX5_SET(arm_rq_in, in, opcode, MLX5_CMD_OP_ARM_RQ);
+	MLX5_SET(arm_rq_in, in, op_mod, MLX5_ARM_RQ_IN_OP_MOD_XRQ);
 	MLX5_SET(arm_rq_in, in, srq_number, srq->srqn);
-	MLX5_SET(arm_rq_in, in, lwm,	    lwm);
+	MLX5_SET(arm_rq_in, in, lwm, lwm);
 	MLX5_SET(arm_rq_in, in, uid, srq->uid);
 
-	return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
+	return mlx5_cmd_exec_in(dev->mdev, arm_rq, in);
 }
 
 static int query_xrq_cmd(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
 			 struct mlx5_srq_attr *out)
 {
-	u32 in[MLX5_ST_SZ_DW(query_xrq_in)] = {0};
+	u32 in[MLX5_ST_SZ_DW(query_xrq_in)] = {};
 	u32 *xrq_out;
 	int outlen = MLX5_ST_SZ_BYTES(query_xrq_out);
 	void *xrqc;
@@ -519,7 +502,7 @@
 	MLX5_SET(query_xrq_in, in, opcode, MLX5_CMD_OP_QUERY_XRQ);
 	MLX5_SET(query_xrq_in, in, xrqn, srq->srqn);
 
-	err = mlx5_cmd_exec(dev->mdev, in, sizeof(in), xrq_out, outlen);
+	err = mlx5_cmd_exec_inout(dev->mdev, query_xrq, in, xrq_out);
 	if (err)
 		goto out;
 
@@ -607,22 +590,32 @@
 	return err;
 }
 
-void mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
+int mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
 {
 	struct mlx5_srq_table *table = &dev->srq_table;
 	struct mlx5_core_srq *tmp;
 	int err;
 
-	tmp = xa_erase_irq(&table->array, srq->srqn);
-	if (!tmp || tmp != srq)
-		return;
+	/* Delete entry, but leave index occupied */
+	tmp = xa_cmpxchg_irq(&table->array, srq->srqn, srq, XA_ZERO_ENTRY, 0);
+	if (WARN_ON(tmp != srq))
+		return xa_err(tmp) ?: -EINVAL;
 
 	err = destroy_srq_split(dev, srq);
-	if (err)
-		return;
+	if (err) {
+		/*
+		 * We don't need to check the returned result for an error,
+		 * because we are restoring the entry into pre-allocated
+		 * xarray space and the store can't fail at this stage.
+		 */
+		xa_cmpxchg_irq(&table->array, srq->srqn, XA_ZERO_ENTRY, srq, 0);
+		return err;
+	}
+	xa_erase_irq(&table->array, srq->srqn);
 
 	mlx5_core_res_put(&srq->common);
 	wait_for_completion(&srq->common.free);
+	return 0;
 }
 
 int mlx5_cmd_query_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
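
The hunks above switch the SRQ/XRQ/RMP commands from open-coded mlx5_cmd_exec() calls to the mlx5_cmd_exec_in()/mlx5_cmd_exec_inout() wrappers, which derive the input and output sizes from the command name and supply the throwaway output buffer themselves. As a rough sketch of what those wrappers (defined in include/linux/mlx5/driver.h, not part of this patch) are assumed to expand to; the exact upstream bodies may differ:

/* Hedged sketch of the command wrappers relied on by the hunks above. */
#define mlx5_cmd_exec_inout(dev, ifc_cmd, in, out)                      \
	mlx5_cmd_exec(dev, in, MLX5_ST_SZ_BYTES(ifc_cmd##_in), out,     \
		      MLX5_ST_SZ_BYTES(ifc_cmd##_out))

#define mlx5_cmd_exec_in(dev, ifc_cmd, in)                              \
	({                                                              \
		u32 _out[MLX5_ST_SZ_DW(ifc_cmd##_out)] = {};            \
		mlx5_cmd_exec_inout(dev, ifc_cmd, in, _out);            \
	})

So mlx5_cmd_exec_in(dev->mdev, destroy_xrq, in) replaces the previous pattern of declaring a destroy_xrq_out array on the stack and passing both buffers together with their sizeof() to mlx5_cmd_exec().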
diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c
new file mode 100644
index 0000000..16145fd
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/std_types.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2020, Mellanox Technologies inc.  All rights reserved.
+ */
+
+#include <rdma/uverbs_ioctl.h>
+#include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/mlx5_user_ioctl_verbs.h>
+#include <linux/mlx5/driver.h>
+#include "mlx5_ib.h"
+
+#define UVERBS_MODULE_NAME mlx5_ib
+#include <rdma/uverbs_named_ioctl.h>
+
+static int UVERBS_HANDLER(MLX5_IB_METHOD_PD_QUERY)(
+	struct uverbs_attr_bundle *attrs)
+{
+	struct ib_pd *pd =
+		uverbs_attr_get_obj(attrs, MLX5_IB_ATTR_QUERY_PD_HANDLE);
+	struct mlx5_ib_pd *mpd = to_mpd(pd);
+
+	return uverbs_copy_to(attrs, MLX5_IB_ATTR_QUERY_PD_RESP_PDN,
+			      &mpd->pdn, sizeof(mpd->pdn));
+}
+
+DECLARE_UVERBS_NAMED_METHOD(
+	MLX5_IB_METHOD_PD_QUERY,
+	UVERBS_ATTR_IDR(MLX5_IB_ATTR_QUERY_PD_HANDLE,
+			UVERBS_OBJECT_PD,
+			UVERBS_ACCESS_READ,
+			UA_MANDATORY),
+	UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_QUERY_PD_RESP_PDN,
+			   UVERBS_ATTR_TYPE(u32),
+			   UA_MANDATORY));
+
+ADD_UVERBS_METHODS(mlx5_ib_pd,
+		   UVERBS_OBJECT_PD,
+		   &UVERBS_METHOD(MLX5_IB_METHOD_PD_QUERY));
+
+const struct uapi_definition mlx5_ib_std_types_defs[] = {
+	UAPI_DEF_CHAIN_OBJ_TREE(
+		UVERBS_OBJECT_PD,
+		&mlx5_ib_pd),
+	{},
+};
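
The new mlx5_ib_std_types_defs[] table only takes effect once it is chained into the driver's uapi definitions, which happens in mlx5_ib_defs[] in main.c, outside this hunk. Assuming the usual chaining pattern, the wiring looks roughly like this (illustrative fragment only):

/* Sketch: the real mlx5_ib_defs[] carries additional entries. */
static const struct uapi_definition mlx5_ib_defs[] = {
	UAPI_DEF_CHAIN(mlx5_ib_devx_defs),
	UAPI_DEF_CHAIN(mlx5_ib_flow_defs),
	UAPI_DEF_CHAIN(mlx5_ib_std_types_defs),
	{},
};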
diff --git a/drivers/infiniband/hw/mlx5/wr.c b/drivers/infiniband/hw/mlx5/wr.c
new file mode 100644
index 0000000..d6038fb
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/wr.c
@@ -0,0 +1,1537 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#include <linux/gfp.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/driver.h>
+#include "wr.h"
+
+static const u32 mlx5_ib_opcode[] = {
+	[IB_WR_SEND]				= MLX5_OPCODE_SEND,
+	[IB_WR_LSO]				= MLX5_OPCODE_LSO,
+	[IB_WR_SEND_WITH_IMM]			= MLX5_OPCODE_SEND_IMM,
+	[IB_WR_RDMA_WRITE]			= MLX5_OPCODE_RDMA_WRITE,
+	[IB_WR_RDMA_WRITE_WITH_IMM]		= MLX5_OPCODE_RDMA_WRITE_IMM,
+	[IB_WR_RDMA_READ]			= MLX5_OPCODE_RDMA_READ,
+	[IB_WR_ATOMIC_CMP_AND_SWP]		= MLX5_OPCODE_ATOMIC_CS,
+	[IB_WR_ATOMIC_FETCH_AND_ADD]		= MLX5_OPCODE_ATOMIC_FA,
+	[IB_WR_SEND_WITH_INV]			= MLX5_OPCODE_SEND_INVAL,
+	[IB_WR_LOCAL_INV]			= MLX5_OPCODE_UMR,
+	[IB_WR_REG_MR]				= MLX5_OPCODE_UMR,
+	[IB_WR_MASKED_ATOMIC_CMP_AND_SWP]	= MLX5_OPCODE_ATOMIC_MASKED_CS,
+	[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]	= MLX5_OPCODE_ATOMIC_MASKED_FA,
+	[MLX5_IB_WR_UMR]			= MLX5_OPCODE_UMR,
+};
+
+/* handle_post_send_edge - Check if we got to the SQ edge. If yes, update to
+ * the next nearby edge and get a new address translation for the WQE position.
+ * @sq: SQ buffer.
+ * @seg: Current WQE position (16B aligned).
+ * @wqe_sz: Total current WQE size [16B].
+ * @cur_edge: Updated current edge.
+ */
+static inline void handle_post_send_edge(struct mlx5_ib_wq *sq, void **seg,
+					 u32 wqe_sz, void **cur_edge)
+{
+	u32 idx;
+
+	if (likely(*seg != *cur_edge))
+		return;
+
+	idx = (sq->cur_post + (wqe_sz >> 2)) & (sq->wqe_cnt - 1);
+	*cur_edge = get_sq_edge(sq, idx);
+
+	*seg = mlx5_frag_buf_get_wqe(&sq->fbc, idx);
+}
+
+/* memcpy_send_wqe - Copy data from src to the WQE and update the relevant WQ's
+ * pointers. At the end @seg is aligned to 16B regardless of the copied size.
+ * @sq: SQ buffer.
+ * @cur_edge: Updated current edge.
+ * @seg: Current WQE position (16B aligned).
+ * @wqe_sz: Total current WQE size [16B].
+ * @src: Pointer to copy from.
+ * @n: Number of bytes to copy.
+ */
+static inline void memcpy_send_wqe(struct mlx5_ib_wq *sq, void **cur_edge,
+				   void **seg, u32 *wqe_sz, const void *src,
+				   size_t n)
+{
+	while (likely(n)) {
+		size_t leftlen = *cur_edge - *seg;
+		size_t copysz = min_t(size_t, leftlen, n);
+		size_t stride;
+
+		memcpy(*seg, src, copysz);
+
+		n -= copysz;
+		src += copysz;
+		stride = !n ? ALIGN(copysz, 16) : copysz;
+		*seg += stride;
+		*wqe_sz += stride >> 4;
+		handle_post_send_edge(sq, seg, *wqe_sz, cur_edge);
+	}
+}
+
+static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq,
+			    struct ib_cq *ib_cq)
+{
+	struct mlx5_ib_cq *cq;
+	unsigned int cur;
+
+	cur = wq->head - wq->tail;
+	if (likely(cur + nreq < wq->max_post))
+		return 0;
+
+	cq = to_mcq(ib_cq);
+	spin_lock(&cq->lock);
+	cur = wq->head - wq->tail;
+	spin_unlock(&cq->lock);
+
+	return cur + nreq >= wq->max_post;
+}
+
+static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg,
+					  u64 remote_addr, u32 rkey)
+{
+	rseg->raddr    = cpu_to_be64(remote_addr);
+	rseg->rkey     = cpu_to_be32(rkey);
+	rseg->reserved = 0;
+}
+
+static void set_eth_seg(const struct ib_send_wr *wr, struct mlx5_ib_qp *qp,
+			void **seg, int *size, void **cur_edge)
+{
+	struct mlx5_wqe_eth_seg *eseg = *seg;
+
+	memset(eseg, 0, sizeof(struct mlx5_wqe_eth_seg));
+
+	if (wr->send_flags & IB_SEND_IP_CSUM)
+		eseg->cs_flags = MLX5_ETH_WQE_L3_CSUM |
+				 MLX5_ETH_WQE_L4_CSUM;
+
+	if (wr->opcode == IB_WR_LSO) {
+		struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr);
+		size_t left, copysz;
+		void *pdata = ud_wr->header;
+		size_t stride;
+
+		left = ud_wr->hlen;
+		eseg->mss = cpu_to_be16(ud_wr->mss);
+		eseg->inline_hdr.sz = cpu_to_be16(left);
+
+		/* memcpy_send_wqe should get a 16B aligned address. Hence,
+		 * we first copy up to the current edge and then, if needed,
+		 * continue with memcpy_send_wqe.
+		 */
+		copysz = min_t(u64, *cur_edge - (void *)eseg->inline_hdr.start,
+			       left);
+		memcpy(eseg->inline_hdr.start, pdata, copysz);
+		stride = ALIGN(sizeof(struct mlx5_wqe_eth_seg) -
+			       sizeof(eseg->inline_hdr.start) + copysz, 16);
+		*size += stride / 16;
+		*seg += stride;
+
+		if (copysz < left) {
+			handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+			left -= copysz;
+			pdata += copysz;
+			memcpy_send_wqe(&qp->sq, cur_edge, seg, size, pdata,
+					left);
+		}
+
+		return;
+	}
+
+	*seg += sizeof(struct mlx5_wqe_eth_seg);
+	*size += sizeof(struct mlx5_wqe_eth_seg) / 16;
+}
+
+static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg,
+			     const struct ib_send_wr *wr)
+{
+	memcpy(&dseg->av, &to_mah(ud_wr(wr)->ah)->av, sizeof(struct mlx5_av));
+	dseg->av.dqp_dct =
+		cpu_to_be32(ud_wr(wr)->remote_qpn | MLX5_EXTENDED_UD_AV);
+	dseg->av.key.qkey.qkey = cpu_to_be32(ud_wr(wr)->remote_qkey);
+}
+
+static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+	dseg->byte_count = cpu_to_be32(sg->length);
+	dseg->lkey       = cpu_to_be32(sg->lkey);
+	dseg->addr       = cpu_to_be64(sg->addr);
+}
+
+static u64 get_xlt_octo(u64 bytes)
+{
+	return ALIGN(bytes, MLX5_IB_UMR_XLT_ALIGNMENT) /
+	       MLX5_IB_UMR_OCTOWORD;
+}
+
+static __be64 frwr_mkey_mask(bool atomic)
+{
+	u64 result;
+
+	result = MLX5_MKEY_MASK_LEN		|
+		MLX5_MKEY_MASK_PAGE_SIZE	|
+		MLX5_MKEY_MASK_START_ADDR	|
+		MLX5_MKEY_MASK_EN_RINVAL	|
+		MLX5_MKEY_MASK_KEY		|
+		MLX5_MKEY_MASK_LR		|
+		MLX5_MKEY_MASK_LW		|
+		MLX5_MKEY_MASK_RR		|
+		MLX5_MKEY_MASK_RW		|
+		MLX5_MKEY_MASK_SMALL_FENCE	|
+		MLX5_MKEY_MASK_FREE;
+
+	if (atomic)
+		result |= MLX5_MKEY_MASK_A;
+
+	return cpu_to_be64(result);
+}
+
+static __be64 sig_mkey_mask(void)
+{
+	u64 result;
+
+	result = MLX5_MKEY_MASK_LEN		|
+		MLX5_MKEY_MASK_PAGE_SIZE	|
+		MLX5_MKEY_MASK_START_ADDR	|
+		MLX5_MKEY_MASK_EN_SIGERR	|
+		MLX5_MKEY_MASK_EN_RINVAL	|
+		MLX5_MKEY_MASK_KEY		|
+		MLX5_MKEY_MASK_LR		|
+		MLX5_MKEY_MASK_LW		|
+		MLX5_MKEY_MASK_RR		|
+		MLX5_MKEY_MASK_RW		|
+		MLX5_MKEY_MASK_SMALL_FENCE	|
+		MLX5_MKEY_MASK_FREE		|
+		MLX5_MKEY_MASK_BSF_EN;
+
+	return cpu_to_be64(result);
+}
+
+static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr,
+			    struct mlx5_ib_mr *mr, u8 flags, bool atomic)
+{
+	int size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size;
+
+	memset(umr, 0, sizeof(*umr));
+
+	umr->flags = flags;
+	umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size));
+	umr->mkey_mask = frwr_mkey_mask(atomic);
+}
+
+static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr)
+{
+	memset(umr, 0, sizeof(*umr));
+	umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE);
+	umr->flags = MLX5_UMR_INLINE;
+}
+
+static __be64 get_umr_enable_mr_mask(void)
+{
+	u64 result;
+
+	result = MLX5_MKEY_MASK_KEY |
+		 MLX5_MKEY_MASK_FREE;
+
+	return cpu_to_be64(result);
+}
+
+static __be64 get_umr_disable_mr_mask(void)
+{
+	u64 result;
+
+	result = MLX5_MKEY_MASK_FREE;
+
+	return cpu_to_be64(result);
+}
+
+static __be64 get_umr_update_translation_mask(void)
+{
+	u64 result;
+
+	result = MLX5_MKEY_MASK_LEN |
+		 MLX5_MKEY_MASK_PAGE_SIZE |
+		 MLX5_MKEY_MASK_START_ADDR;
+
+	return cpu_to_be64(result);
+}
+
+static __be64 get_umr_update_access_mask(int atomic,
+					 int relaxed_ordering_write,
+					 int relaxed_ordering_read)
+{
+	u64 result;
+
+	result = MLX5_MKEY_MASK_LR |
+		 MLX5_MKEY_MASK_LW |
+		 MLX5_MKEY_MASK_RR |
+		 MLX5_MKEY_MASK_RW;
+
+	if (atomic)
+		result |= MLX5_MKEY_MASK_A;
+
+	if (relaxed_ordering_write)
+		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE;
+
+	if (relaxed_ordering_read)
+		result |= MLX5_MKEY_MASK_RELAXED_ORDERING_READ;
+
+	return cpu_to_be64(result);
+}
+
+static __be64 get_umr_update_pd_mask(void)
+{
+	u64 result;
+
+	result = MLX5_MKEY_MASK_PD;
+
+	return cpu_to_be64(result);
+}
+
+static int umr_check_mkey_mask(struct mlx5_ib_dev *dev, u64 mask)
+{
+	if (mask & MLX5_MKEY_MASK_PAGE_SIZE &&
+	    MLX5_CAP_GEN(dev->mdev, umr_modify_entity_size_disabled))
+		return -EPERM;
+
+	if (mask & MLX5_MKEY_MASK_A &&
+	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
+		return -EPERM;
+
+	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_WRITE &&
+	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
+		return -EPERM;
+
+	if (mask & MLX5_MKEY_MASK_RELAXED_ORDERING_READ &&
+	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
+		return -EPERM;
+
+	return 0;
+}
+
+static int set_reg_umr_segment(struct mlx5_ib_dev *dev,
+			       struct mlx5_wqe_umr_ctrl_seg *umr,
+			       const struct ib_send_wr *wr)
+{
+	const struct mlx5_umr_wr *umrwr = umr_wr(wr);
+
+	memset(umr, 0, sizeof(*umr));
+
+	if (!umrwr->ignore_free_state) {
+		if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE)
+			 /* fail if free */
+			umr->flags = MLX5_UMR_CHECK_FREE;
+		else
+			/* fail if not free */
+			umr->flags = MLX5_UMR_CHECK_NOT_FREE;
+	}
+
+	umr->xlt_octowords = cpu_to_be16(get_xlt_octo(umrwr->xlt_size));
+	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_XLT) {
+		u64 offset = get_xlt_octo(umrwr->offset);
+
+		umr->xlt_offset = cpu_to_be16(offset & 0xffff);
+		umr->xlt_offset_47_16 = cpu_to_be32(offset >> 16);
+		umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
+	}
+	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION)
+		umr->mkey_mask |= get_umr_update_translation_mask();
+	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS) {
+		umr->mkey_mask |= get_umr_update_access_mask(
+			!!(MLX5_CAP_GEN(dev->mdev, atomic)),
+			!!(MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)),
+			!!(MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)));
+		umr->mkey_mask |= get_umr_update_pd_mask();
+	}
+	if (wr->send_flags & MLX5_IB_SEND_UMR_ENABLE_MR)
+		umr->mkey_mask |= get_umr_enable_mr_mask();
+	if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR)
+		umr->mkey_mask |= get_umr_disable_mr_mask();
+
+	if (!wr->num_sge)
+		umr->flags |= MLX5_UMR_INLINE;
+
+	return umr_check_mkey_mask(dev, be64_to_cpu(umr->mkey_mask));
+}
+
+static u8 get_umr_flags(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC       : 0) |
+	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX5_PERM_REMOTE_WRITE : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ   ? MLX5_PERM_REMOTE_READ  : 0) |
+	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX5_PERM_LOCAL_WRITE  : 0) |
+		MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN;
+}
+
+static void set_reg_mkey_seg(struct mlx5_mkey_seg *seg,
+			     struct mlx5_ib_mr *mr,
+			     u32 key, int access)
+{
+	int ndescs = ALIGN(mr->ndescs + mr->meta_ndescs, 8) >> 1;
+
+	memset(seg, 0, sizeof(*seg));
+
+	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_MTT)
+		seg->log2_page_size = ilog2(mr->ibmr.page_size);
+	else if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
+		/* KLMs take twice the size of MTTs */
+		ndescs *= 2;
+
+	seg->flags = get_umr_flags(access) | mr->access_mode;
+	seg->qpn_mkey7_0 = cpu_to_be32((key & 0xff) | 0xffffff00);
+	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL);
+	seg->start_addr = cpu_to_be64(mr->ibmr.iova);
+	seg->len = cpu_to_be64(mr->ibmr.length);
+	seg->xlt_oct_size = cpu_to_be32(ndescs);
+}
+
+static void set_linv_mkey_seg(struct mlx5_mkey_seg *seg)
+{
+	memset(seg, 0, sizeof(*seg));
+	seg->status = MLX5_MKEY_STATUS_FREE;
+}
+
+static void set_reg_mkey_segment(struct mlx5_ib_dev *dev,
+				 struct mlx5_mkey_seg *seg,
+				 const struct ib_send_wr *wr)
+{
+	const struct mlx5_umr_wr *umrwr = umr_wr(wr);
+
+	memset(seg, 0, sizeof(*seg));
+	if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR)
+		MLX5_SET(mkc, seg, free, 1);
+
+	MLX5_SET(mkc, seg, a,
+		 !!(umrwr->access_flags & IB_ACCESS_REMOTE_ATOMIC));
+	MLX5_SET(mkc, seg, rw,
+		 !!(umrwr->access_flags & IB_ACCESS_REMOTE_WRITE));
+	MLX5_SET(mkc, seg, rr, !!(umrwr->access_flags & IB_ACCESS_REMOTE_READ));
+	MLX5_SET(mkc, seg, lw, !!(umrwr->access_flags & IB_ACCESS_LOCAL_WRITE));
+	MLX5_SET(mkc, seg, lr, 1);
+	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
+		MLX5_SET(mkc, seg, relaxed_ordering_write,
+			 !!(umrwr->access_flags & IB_ACCESS_RELAXED_ORDERING));
+	if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
+		MLX5_SET(mkc, seg, relaxed_ordering_read,
+			 !!(umrwr->access_flags & IB_ACCESS_RELAXED_ORDERING));
+
+	if (umrwr->pd)
+		MLX5_SET(mkc, seg, pd, to_mpd(umrwr->pd)->pdn);
+	if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION &&
+	    !umrwr->length)
+		MLX5_SET(mkc, seg, length64, 1);
+
+	MLX5_SET64(mkc, seg, start_addr, umrwr->virt_addr);
+	MLX5_SET64(mkc, seg, len, umrwr->length);
+	MLX5_SET(mkc, seg, log_page_size, umrwr->page_shift);
+	MLX5_SET(mkc, seg, qpn, 0xffffff);
+	MLX5_SET(mkc, seg, mkey_7_0, mlx5_mkey_variant(umrwr->mkey));
+}
+
+static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg,
+			     struct mlx5_ib_mr *mr,
+			     struct mlx5_ib_pd *pd)
+{
+	int bcount = mr->desc_size * (mr->ndescs + mr->meta_ndescs);
+
+	dseg->addr = cpu_to_be64(mr->desc_map);
+	dseg->byte_count = cpu_to_be32(ALIGN(bcount, 64));
+	dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey);
+}
+
+static __be32 send_ieth(const struct ib_send_wr *wr)
+{
+	switch (wr->opcode) {
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		return wr->ex.imm_data;
+
+	case IB_WR_SEND_WITH_INV:
+		return cpu_to_be32(wr->ex.invalidate_rkey);
+
+	default:
+		return 0;
+	}
+}
+
+static u8 calc_sig(void *wqe, int size)
+{
+	u8 *p = wqe;
+	u8 res = 0;
+	int i;
+
+	for (i = 0; i < size; i++)
+		res ^= p[i];
+
+	return ~res;
+}
+
+static u8 wq_sig(void *wqe)
+{
+	return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4);
+}
+
+static int set_data_inl_seg(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr,
+			    void **wqe, int *wqe_sz, void **cur_edge)
+{
+	struct mlx5_wqe_inline_seg *seg;
+	size_t offset;
+	int inl = 0;
+	int i;
+
+	seg = *wqe;
+	*wqe += sizeof(*seg);
+	offset = sizeof(*seg);
+
+	for (i = 0; i < wr->num_sge; i++) {
+		size_t len  = wr->sg_list[i].length;
+		void *addr = (void *)(unsigned long)(wr->sg_list[i].addr);
+
+		inl += len;
+
+		if (unlikely(inl > qp->max_inline_data))
+			return -ENOMEM;
+
+		while (likely(len)) {
+			size_t leftlen;
+			size_t copysz;
+
+			handle_post_send_edge(&qp->sq, wqe,
+					      *wqe_sz + (offset >> 4),
+					      cur_edge);
+
+			leftlen = *cur_edge - *wqe;
+			copysz = min_t(size_t, leftlen, len);
+
+			memcpy(*wqe, addr, copysz);
+			len -= copysz;
+			addr += copysz;
+			*wqe += copysz;
+			offset += copysz;
+		}
+	}
+
+	seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG);
+
+	*wqe_sz += ALIGN(inl + sizeof(seg->byte_count), 16) / 16;
+
+	return 0;
+}
+
+static u16 prot_field_size(enum ib_signature_type type)
+{
+	switch (type) {
+	case IB_SIG_TYPE_T10_DIF:
+		return MLX5_DIF_SIZE;
+	default:
+		return 0;
+	}
+}
+
+static u8 bs_selector(int block_size)
+{
+	switch (block_size) {
+	case 512:	    return 0x1;
+	case 520:	    return 0x2;
+	case 4096:	    return 0x3;
+	case 4160:	    return 0x4;
+	case 1073741824:    return 0x5;
+	default:	    return 0;
+	}
+}
+
+static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain,
+			      struct mlx5_bsf_inl *inl)
+{
+	/* Valid inline section and allow BSF refresh */
+	inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID |
+				       MLX5_BSF_REFRESH_DIF);
+	inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag);
+	inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag);
+	/* repeating block */
+	inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK;
+	inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ?
+			MLX5_DIF_CRC : MLX5_DIF_IPCS;
+
+	if (domain->sig.dif.ref_remap)
+		inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG;
+
+	if (domain->sig.dif.app_escape) {
+		if (domain->sig.dif.ref_escape)
+			inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE;
+		else
+			inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE;
+	}
+
+	inl->dif_app_bitmask_check =
+		cpu_to_be16(domain->sig.dif.apptag_check_mask);
+}
+
+static int mlx5_set_bsf(struct ib_mr *sig_mr,
+			struct ib_sig_attrs *sig_attrs,
+			struct mlx5_bsf *bsf, u32 data_size)
+{
+	struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig;
+	struct mlx5_bsf_basic *basic = &bsf->basic;
+	struct ib_sig_domain *mem = &sig_attrs->mem;
+	struct ib_sig_domain *wire = &sig_attrs->wire;
+
+	memset(bsf, 0, sizeof(*bsf));
+
+	/* Basic + Extended + Inline */
+	basic->bsf_size_sbs = 1 << 7;
+	/* Input domain check byte mask */
+	basic->check_byte_mask = sig_attrs->check_mask;
+	basic->raw_data_size = cpu_to_be32(data_size);
+
+	/* Memory domain */
+	switch (sig_attrs->mem.sig_type) {
+	case IB_SIG_TYPE_NONE:
+		break;
+	case IB_SIG_TYPE_T10_DIF:
+		basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval);
+		basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx);
+		mlx5_fill_inl_bsf(mem, &bsf->m_inl);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Wire domain */
+	switch (sig_attrs->wire.sig_type) {
+	case IB_SIG_TYPE_NONE:
+		break;
+	case IB_SIG_TYPE_T10_DIF:
+		if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval &&
+		    mem->sig_type == wire->sig_type) {
+			/* Same block structure */
+			basic->bsf_size_sbs |= 1 << 4;
+			if (mem->sig.dif.bg_type == wire->sig.dif.bg_type)
+				basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK;
+			if (mem->sig.dif.app_tag == wire->sig.dif.app_tag)
+				basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK;
+			if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag)
+				basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK;
+		} else
+			basic->wire.bs_selector =
+				bs_selector(wire->sig.dif.pi_interval);
+
+		basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx);
+		mlx5_fill_inl_bsf(wire, &bsf->w_inl);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+
+static int set_sig_data_segment(const struct ib_send_wr *send_wr,
+				struct ib_mr *sig_mr,
+				struct ib_sig_attrs *sig_attrs,
+				struct mlx5_ib_qp *qp, void **seg, int *size,
+				void **cur_edge)
+{
+	struct mlx5_bsf *bsf;
+	u32 data_len;
+	u32 data_key;
+	u64 data_va;
+	u32 prot_len = 0;
+	u32 prot_key = 0;
+	u64 prot_va = 0;
+	bool prot = false;
+	int ret;
+	int wqe_size;
+	struct mlx5_ib_mr *mr = to_mmr(sig_mr);
+	struct mlx5_ib_mr *pi_mr = mr->pi_mr;
+
+	data_len = pi_mr->data_length;
+	data_key = pi_mr->ibmr.lkey;
+	data_va = pi_mr->data_iova;
+	if (pi_mr->meta_ndescs) {
+		prot_len = pi_mr->meta_length;
+		prot_key = pi_mr->ibmr.lkey;
+		prot_va = pi_mr->pi_iova;
+		prot = true;
+	}
+
+	if (!prot || (data_key == prot_key && data_va == prot_va &&
+		      data_len == prot_len)) {
+		/**
+		 * The source domain doesn't contain signature information,
+		 * or data and protection are interleaved in memory,
+		 * so we need to construct:
+		 *                  ------------------
+		 *                 |     data_klm     |
+		 *                  ------------------
+		 *                 |       BSF        |
+		 *                  ------------------
+		 **/
+		struct mlx5_klm *data_klm = *seg;
+
+		data_klm->bcount = cpu_to_be32(data_len);
+		data_klm->key = cpu_to_be32(data_key);
+		data_klm->va = cpu_to_be64(data_va);
+		wqe_size = ALIGN(sizeof(*data_klm), 64);
+	} else {
+		/**
+		 * The source domain contains signature information,
+		 * so we need to construct a strided block format:
+		 *               ---------------------------
+		 *              |     stride_block_ctrl     |
+		 *               ---------------------------
+		 *              |          data_klm         |
+		 *               ---------------------------
+		 *              |          prot_klm         |
+		 *               ---------------------------
+		 *              |             BSF           |
+		 *               ---------------------------
+		 **/
+		struct mlx5_stride_block_ctrl_seg *sblock_ctrl;
+		struct mlx5_stride_block_entry *data_sentry;
+		struct mlx5_stride_block_entry *prot_sentry;
+		u16 block_size = sig_attrs->mem.sig.dif.pi_interval;
+		int prot_size;
+
+		sblock_ctrl = *seg;
+		data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl);
+		prot_sentry = (void *)data_sentry + sizeof(*data_sentry);
+
+		prot_size = prot_field_size(sig_attrs->mem.sig_type);
+		if (!prot_size) {
+			pr_err("Bad block size given: %u\n", block_size);
+			return -EINVAL;
+		}
+		sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size +
+							    prot_size);
+		sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP);
+		sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size);
+		sblock_ctrl->num_entries = cpu_to_be16(2);
+
+		data_sentry->bcount = cpu_to_be16(block_size);
+		data_sentry->key = cpu_to_be32(data_key);
+		data_sentry->va = cpu_to_be64(data_va);
+		data_sentry->stride = cpu_to_be16(block_size);
+
+		prot_sentry->bcount = cpu_to_be16(prot_size);
+		prot_sentry->key = cpu_to_be32(prot_key);
+		prot_sentry->va = cpu_to_be64(prot_va);
+		prot_sentry->stride = cpu_to_be16(prot_size);
+
+		wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) +
+				 sizeof(*prot_sentry), 64);
+	}
+
+	*seg += wqe_size;
+	*size += wqe_size / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+
+	bsf = *seg;
+	ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len);
+	if (ret)
+		return -EINVAL;
+
+	*seg += sizeof(*bsf);
+	*size += sizeof(*bsf) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+
+	return 0;
+}
+
+static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg,
+				 struct ib_mr *sig_mr, int access_flags,
+				 u32 size, u32 length, u32 pdn)
+{
+	u32 sig_key = sig_mr->rkey;
+	u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1;
+
+	memset(seg, 0, sizeof(*seg));
+
+	seg->flags = get_umr_flags(access_flags) | MLX5_MKC_ACCESS_MODE_KLMS;
+	seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00);
+	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 |
+				    MLX5_MKEY_BSF_EN | pdn);
+	seg->len = cpu_to_be64(length);
+	seg->xlt_oct_size = cpu_to_be32(get_xlt_octo(size));
+	seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE);
+}
+
+static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
+				u32 size)
+{
+	memset(umr, 0, sizeof(*umr));
+
+	umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE;
+	umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size));
+	umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE);
+	umr->mkey_mask = sig_mkey_mask();
+}
+
+static int set_pi_umr_wr(const struct ib_send_wr *send_wr,
+			 struct mlx5_ib_qp *qp, void **seg, int *size,
+			 void **cur_edge)
+{
+	const struct ib_reg_wr *wr = reg_wr(send_wr);
+	struct mlx5_ib_mr *sig_mr = to_mmr(wr->mr);
+	struct mlx5_ib_mr *pi_mr = sig_mr->pi_mr;
+	struct ib_sig_attrs *sig_attrs = sig_mr->ibmr.sig_attrs;
+	u32 pdn = to_mpd(qp->ibqp.pd)->pdn;
+	u32 xlt_size;
+	int region_len, ret;
+
+	if (unlikely(send_wr->num_sge != 0) ||
+	    unlikely(wr->access & IB_ACCESS_REMOTE_ATOMIC) ||
+	    unlikely(!sig_mr->sig) || unlikely(!qp->ibqp.integrity_en) ||
+	    unlikely(!sig_mr->sig->sig_status_checked))
+		return -EINVAL;
+
+	/* length of the protected region, data + protection */
+	region_len = pi_mr->ibmr.length;
+
+	/**
+	 * KLM octoword size - if protection was provided
+	 * then we use strided block format (3 octowords),
+	 * else we use single KLM (1 octoword)
+	 **/
+	if (sig_attrs->mem.sig_type != IB_SIG_TYPE_NONE)
+		xlt_size = 0x30;
+	else
+		xlt_size = sizeof(struct mlx5_klm);
+
+	set_sig_umr_segment(*seg, xlt_size);
+	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+
+	set_sig_mkey_segment(*seg, wr->mr, wr->access, xlt_size, region_len,
+			     pdn);
+	*seg += sizeof(struct mlx5_mkey_seg);
+	*size += sizeof(struct mlx5_mkey_seg) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+
+	ret = set_sig_data_segment(send_wr, wr->mr, sig_attrs, qp, seg, size,
+				   cur_edge);
+	if (ret)
+		return ret;
+
+	sig_mr->sig->sig_status_checked = false;
+	return 0;
+}
+
+static int set_psv_wr(struct ib_sig_domain *domain,
+		      u32 psv_idx, void **seg, int *size)
+{
+	struct mlx5_seg_set_psv *psv_seg = *seg;
+
+	memset(psv_seg, 0, sizeof(*psv_seg));
+	psv_seg->psv_num = cpu_to_be32(psv_idx);
+	switch (domain->sig_type) {
+	case IB_SIG_TYPE_NONE:
+		break;
+	case IB_SIG_TYPE_T10_DIF:
+		psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 |
+						     domain->sig.dif.app_tag);
+		psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag);
+		break;
+	default:
+		pr_err("Bad signature type (%d) is given.\n",
+		       domain->sig_type);
+		return -EINVAL;
+	}
+
+	*seg += sizeof(*psv_seg);
+	*size += sizeof(*psv_seg) / 16;
+
+	return 0;
+}
+
+static int set_reg_wr(struct mlx5_ib_qp *qp,
+		      const struct ib_reg_wr *wr,
+		      void **seg, int *size, void **cur_edge,
+		      bool check_not_free)
+{
+	struct mlx5_ib_mr *mr = to_mmr(wr->mr);
+	struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd);
+	struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
+	int mr_list_size = (mr->ndescs + mr->meta_ndescs) * mr->desc_size;
+	bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD;
+	bool atomic = wr->access & IB_ACCESS_REMOTE_ATOMIC;
+	u8 flags = 0;
+
+	/* Matches access in mlx5_set_umr_free_mkey() */
+	if (!mlx5_ib_can_reconfig_with_umr(dev, 0, wr->access)) {
+		mlx5_ib_warn(
+			to_mdev(qp->ibqp.device),
+			"Fast update for MR access flags is not possible\n");
+		return -EINVAL;
+	}
+
+	if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) {
+		mlx5_ib_warn(to_mdev(qp->ibqp.device),
+			     "Invalid IB_SEND_INLINE send flag\n");
+		return -EINVAL;
+	}
+
+	if (check_not_free)
+		flags |= MLX5_UMR_CHECK_NOT_FREE;
+	if (umr_inline)
+		flags |= MLX5_UMR_INLINE;
+
+	set_reg_umr_seg(*seg, mr, flags, atomic);
+	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+
+	set_reg_mkey_seg(*seg, mr, wr->key, wr->access);
+	*seg += sizeof(struct mlx5_mkey_seg);
+	*size += sizeof(struct mlx5_mkey_seg) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+
+	if (umr_inline) {
+		memcpy_send_wqe(&qp->sq, cur_edge, seg, size, mr->descs,
+				mr_list_size);
+		*size = ALIGN(*size, MLX5_SEND_WQE_BB >> 4);
+	} else {
+		set_reg_data_seg(*seg, mr, pd);
+		*seg += sizeof(struct mlx5_wqe_data_seg);
+		*size += (sizeof(struct mlx5_wqe_data_seg) / 16);
+	}
+	return 0;
+}
+
+static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size,
+			void **cur_edge)
+{
+	set_linv_umr_seg(*seg);
+	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+	set_linv_mkey_seg(*seg);
+	*seg += sizeof(struct mlx5_mkey_seg);
+	*size += sizeof(struct mlx5_mkey_seg) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+}
+
+static void dump_wqe(struct mlx5_ib_qp *qp, u32 idx, int size_16)
+{
+	__be32 *p = NULL;
+	int i, j;
+
+	pr_debug("dump WQE index %u:\n", idx);
+	for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) {
+		if ((i & 0xf) == 0) {
+			p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx);
+			pr_debug("WQBB at %p:\n", (void *)p);
+			j = 0;
+			idx = (idx + 1) & (qp->sq.wqe_cnt - 1);
+		}
+		pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]),
+			 be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]),
+			 be32_to_cpu(p[j + 3]));
+	}
+}
+
+static int __begin_wqe(struct mlx5_ib_qp *qp, void **seg,
+		       struct mlx5_wqe_ctrl_seg **ctrl,
+		       const struct ib_send_wr *wr, unsigned int *idx,
+		       int *size, void **cur_edge, int nreq,
+		       bool send_signaled, bool solicited)
+{
+	if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)))
+		return -ENOMEM;
+
+	*idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
+	*seg = mlx5_frag_buf_get_wqe(&qp->sq.fbc, *idx);
+	*ctrl = *seg;
+	*(uint32_t *)(*seg + 8) = 0;
+	(*ctrl)->imm = send_ieth(wr);
+	(*ctrl)->fm_ce_se = qp->sq_signal_bits |
+		(send_signaled ? MLX5_WQE_CTRL_CQ_UPDATE : 0) |
+		(solicited ? MLX5_WQE_CTRL_SOLICITED : 0);
+
+	*seg += sizeof(**ctrl);
+	*size = sizeof(**ctrl) / 16;
+	*cur_edge = qp->sq.cur_edge;
+
+	return 0;
+}
+
+static int begin_wqe(struct mlx5_ib_qp *qp, void **seg,
+		     struct mlx5_wqe_ctrl_seg **ctrl,
+		     const struct ib_send_wr *wr, unsigned int *idx, int *size,
+		     void **cur_edge, int nreq)
+{
+	return __begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq,
+			   wr->send_flags & IB_SEND_SIGNALED,
+			   wr->send_flags & IB_SEND_SOLICITED);
+}
+
+static void finish_wqe(struct mlx5_ib_qp *qp,
+		       struct mlx5_wqe_ctrl_seg *ctrl,
+		       void *seg, u8 size, void *cur_edge,
+		       unsigned int idx, u64 wr_id, int nreq, u8 fence,
+		       u32 mlx5_opcode)
+{
+	u8 opmod = 0;
+
+	ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) |
+					     mlx5_opcode | ((u32)opmod << 24));
+	ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8));
+	ctrl->fm_ce_se |= fence;
+	if (unlikely(qp->flags_en & MLX5_QP_FLAG_SIGNATURE))
+		ctrl->signature = wq_sig(ctrl);
+
+	qp->sq.wrid[idx] = wr_id;
+	qp->sq.w_list[idx].opcode = mlx5_opcode;
+	qp->sq.wqe_head[idx] = qp->sq.head + nreq;
+	qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
+	qp->sq.w_list[idx].next = qp->sq.cur_post;
+
+	/* We save the edge, which was possibly updated during the WQE
+	 * construction, into the SQ's cache.
+	 */
+	seg = PTR_ALIGN(seg, MLX5_SEND_WQE_BB);
+	qp->sq.cur_edge = (unlikely(seg == cur_edge)) ?
+			  get_sq_edge(&qp->sq, qp->sq.cur_post &
+				      (qp->sq.wqe_cnt - 1)) :
+			  cur_edge;
+}
+
+static void handle_rdma_op(const struct ib_send_wr *wr, void **seg, int *size)
+{
+	set_raddr_seg(*seg, rdma_wr(wr)->remote_addr, rdma_wr(wr)->rkey);
+	*seg += sizeof(struct mlx5_wqe_raddr_seg);
+	*size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
+}
+
+static void handle_local_inv(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr,
+			     struct mlx5_wqe_ctrl_seg **ctrl, void **seg,
+			     int *size, void **cur_edge, unsigned int idx)
+{
+	qp->sq.wr_data[idx] = IB_WR_LOCAL_INV;
+	(*ctrl)->imm = cpu_to_be32(wr->ex.invalidate_rkey);
+	set_linv_wr(qp, seg, size, cur_edge);
+}
+
+static int handle_reg_mr(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr,
+			 struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size,
+			 void **cur_edge, unsigned int idx)
+{
+	qp->sq.wr_data[idx] = IB_WR_REG_MR;
+	(*ctrl)->imm = cpu_to_be32(reg_wr(wr)->key);
+	return set_reg_wr(qp, reg_wr(wr), seg, size, cur_edge, true);
+}
+
+static int handle_psv(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+		      const struct ib_send_wr *wr,
+		      struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size,
+		      void **cur_edge, unsigned int *idx, int nreq,
+		      struct ib_sig_domain *domain, u32 psv_index,
+		      u8 next_fence)
+{
+	int err;
+
+	/*
+	 * SET_PSV WQEs are not signaled and are solicited on error.
+	 */
+	err = __begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq,
+			  false, true);
+	if (unlikely(err)) {
+		mlx5_ib_warn(dev, "\n");
+		err = -ENOMEM;
+		goto out;
+	}
+	err = set_psv_wr(domain, psv_index, seg, size);
+	if (unlikely(err)) {
+		mlx5_ib_warn(dev, "\n");
+		goto out;
+	}
+	finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, nreq,
+		   next_fence, MLX5_OPCODE_SET_PSV);
+
+out:
+	return err;
+}
+
+static int handle_reg_mr_integrity(struct mlx5_ib_dev *dev,
+				   struct mlx5_ib_qp *qp,
+				   const struct ib_send_wr *wr,
+				   struct mlx5_wqe_ctrl_seg **ctrl, void **seg,
+				   int *size, void **cur_edge,
+				   unsigned int *idx, int nreq, u8 fence,
+				   u8 next_fence)
+{
+	struct mlx5_ib_mr *mr;
+	struct mlx5_ib_mr *pi_mr;
+	struct mlx5_ib_mr pa_pi_mr;
+	struct ib_sig_attrs *sig_attrs;
+	struct ib_reg_wr reg_pi_wr;
+	int err;
+
+	qp->sq.wr_data[*idx] = IB_WR_REG_MR_INTEGRITY;
+
+	mr = to_mmr(reg_wr(wr)->mr);
+	pi_mr = mr->pi_mr;
+
+	if (pi_mr) {
+		memset(&reg_pi_wr, 0,
+		       sizeof(struct ib_reg_wr));
+
+		reg_pi_wr.mr = &pi_mr->ibmr;
+		reg_pi_wr.access = reg_wr(wr)->access;
+		reg_pi_wr.key = pi_mr->ibmr.rkey;
+
+		(*ctrl)->imm = cpu_to_be32(reg_pi_wr.key);
+		/* UMR for data + prot registration */
+		err = set_reg_wr(qp, &reg_pi_wr, seg, size, cur_edge, false);
+		if (unlikely(err))
+			goto out;
+
+		finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id,
+			   nreq, fence, MLX5_OPCODE_UMR);
+
+		err = begin_wqe(qp, seg, ctrl, wr, idx, size, cur_edge, nreq);
+		if (unlikely(err)) {
+			mlx5_ib_warn(dev, "\n");
+			err = -ENOMEM;
+			goto out;
+		}
+	} else {
+		memset(&pa_pi_mr, 0, sizeof(struct mlx5_ib_mr));
+		/* No UMR, use local_dma_lkey */
+		pa_pi_mr.ibmr.lkey = mr->ibmr.pd->local_dma_lkey;
+		pa_pi_mr.ndescs = mr->ndescs;
+		pa_pi_mr.data_length = mr->data_length;
+		pa_pi_mr.data_iova = mr->data_iova;
+		if (mr->meta_ndescs) {
+			pa_pi_mr.meta_ndescs = mr->meta_ndescs;
+			pa_pi_mr.meta_length = mr->meta_length;
+			pa_pi_mr.pi_iova = mr->pi_iova;
+		}
+
+		pa_pi_mr.ibmr.length = mr->ibmr.length;
+		mr->pi_mr = &pa_pi_mr;
+	}
+	(*ctrl)->imm = cpu_to_be32(mr->ibmr.rkey);
+	/* UMR for sig MR */
+	err = set_pi_umr_wr(wr, qp, seg, size, cur_edge);
+	if (unlikely(err)) {
+		mlx5_ib_warn(dev, "\n");
+		goto out;
+	}
+	finish_wqe(qp, *ctrl, *seg, *size, *cur_edge, *idx, wr->wr_id, nreq,
+		   fence, MLX5_OPCODE_UMR);
+
+	sig_attrs = mr->ibmr.sig_attrs;
+	err = handle_psv(dev, qp, wr, ctrl, seg, size, cur_edge, idx, nreq,
+			 &sig_attrs->mem, mr->sig->psv_memory.psv_idx,
+			 next_fence);
+	if (unlikely(err))
+		goto out;
+
+	err = handle_psv(dev, qp, wr, ctrl, seg, size, cur_edge, idx, nreq,
+			 &sig_attrs->wire, mr->sig->psv_wire.psv_idx,
+			 next_fence);
+	if (unlikely(err))
+		goto out;
+
+	qp->next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
+
+out:
+	return err;
+}
+
+static int handle_qpt_rc(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+			 const struct ib_send_wr *wr,
+			 struct mlx5_wqe_ctrl_seg **ctrl, void **seg, int *size,
+			 void **cur_edge, unsigned int *idx, int nreq, u8 fence,
+			 u8 next_fence, int *num_sge)
+{
+	int err = 0;
+
+	switch (wr->opcode) {
+	case IB_WR_RDMA_READ:
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		handle_rdma_op(wr, seg, size);
+		break;
+
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+	case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+		mlx5_ib_warn(dev, "Atomic operations are not supported yet\n");
+		err = -EOPNOTSUPP;
+		goto out;
+
+	case IB_WR_LOCAL_INV:
+		handle_local_inv(qp, wr, ctrl, seg, size, cur_edge, *idx);
+		*num_sge = 0;
+		break;
+
+	case IB_WR_REG_MR:
+		err = handle_reg_mr(qp, wr, ctrl, seg, size, cur_edge, *idx);
+		if (unlikely(err))
+			goto out;
+		*num_sge = 0;
+		break;
+
+	case IB_WR_REG_MR_INTEGRITY:
+		err = handle_reg_mr_integrity(dev, qp, wr, ctrl, seg, size,
+					      cur_edge, idx, nreq, fence,
+					      next_fence);
+		if (unlikely(err))
+			goto out;
+		*num_sge = 0;
+		break;
+
+	default:
+		break;
+	}
+
+out:
+	return err;
+}
+
+static void handle_qpt_uc(const struct ib_send_wr *wr, void **seg, int *size)
+{
+	switch (wr->opcode) {
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		handle_rdma_op(wr, seg, size);
+		break;
+	default:
+		break;
+	}
+}
+
+static void handle_qpt_hw_gsi(struct mlx5_ib_qp *qp,
+			      const struct ib_send_wr *wr, void **seg,
+			      int *size, void **cur_edge)
+{
+	set_datagram_seg(*seg, wr);
+	*seg += sizeof(struct mlx5_wqe_datagram_seg);
+	*size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+}
+
+static void handle_qpt_ud(struct mlx5_ib_qp *qp, const struct ib_send_wr *wr,
+			  void **seg, int *size, void **cur_edge)
+{
+	set_datagram_seg(*seg, wr);
+	*seg += sizeof(struct mlx5_wqe_datagram_seg);
+	*size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+
+	/* handle qp that supports ud offload */
+	if (qp->flags & IB_QP_CREATE_IPOIB_UD_LSO) {
+		struct mlx5_wqe_eth_pad *pad;
+
+		pad = *seg;
+		memset(pad, 0, sizeof(struct mlx5_wqe_eth_pad));
+		*seg += sizeof(struct mlx5_wqe_eth_pad);
+		*size += sizeof(struct mlx5_wqe_eth_pad) / 16;
+		set_eth_seg(wr, qp, seg, size, cur_edge);
+		handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+	}
+}
+
+static int handle_qpt_reg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+			      const struct ib_send_wr *wr,
+			      struct mlx5_wqe_ctrl_seg **ctrl, void **seg,
+			      int *size, void **cur_edge, unsigned int idx)
+{
+	int err = 0;
+
+	if (unlikely(wr->opcode != MLX5_IB_WR_UMR)) {
+		err = -EINVAL;
+		mlx5_ib_warn(dev, "bad opcode %d\n", wr->opcode);
+		goto out;
+	}
+
+	qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;
+	(*ctrl)->imm = cpu_to_be32(umr_wr(wr)->mkey);
+	err = set_reg_umr_segment(dev, *seg, wr);
+	if (unlikely(err))
+		goto out;
+	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+	set_reg_mkey_segment(dev, *seg, wr);
+	*seg += sizeof(struct mlx5_mkey_seg);
+	*size += sizeof(struct mlx5_mkey_seg) / 16;
+	handle_post_send_edge(&qp->sq, seg, *size, cur_edge);
+out:
+	return err;
+}
+
+int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
+		      const struct ib_send_wr **bad_wr, bool drain)
+{
+	struct mlx5_wqe_ctrl_seg *ctrl = NULL;  /* compiler warning */
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx5_core_dev *mdev = dev->mdev;
+	struct mlx5_ib_qp *qp;
+	struct mlx5_wqe_xrc_seg *xrc;
+	struct mlx5_bf *bf;
+	void *cur_edge;
+	int size;
+	unsigned long flags;
+	unsigned int idx;
+	int err = 0;
+	int num_sge;
+	void *seg;
+	int nreq;
+	int i;
+	u8 next_fence = 0;
+	u8 fence;
+
+	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR &&
+		     !drain)) {
+		*bad_wr = wr;
+		return -EIO;
+	}
+
+	if (unlikely(ibqp->qp_type == IB_QPT_GSI))
+		return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr);
+
+	qp = to_mqp(ibqp);
+	bf = &qp->bf;
+
+	spin_lock_irqsave(&qp->sq.lock, flags);
+
+	for (nreq = 0; wr; nreq++, wr = wr->next) {
+		if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) {
+			mlx5_ib_warn(dev, "\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		num_sge = wr->num_sge;
+		if (unlikely(num_sge > qp->sq.max_gs)) {
+			mlx5_ib_warn(dev, "\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, &cur_edge,
+				nreq);
+		if (err) {
+			mlx5_ib_warn(dev, "\n");
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (wr->opcode == IB_WR_REG_MR ||
+		    wr->opcode == IB_WR_REG_MR_INTEGRITY) {
+			fence = dev->umr_fence;
+			next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
+		} else {
+			if (wr->send_flags & IB_SEND_FENCE) {
+				if (qp->next_fence)
+					fence = MLX5_FENCE_MODE_SMALL_AND_FENCE;
+				else
+					fence = MLX5_FENCE_MODE_FENCE;
+			} else {
+				fence = qp->next_fence;
+			}
+		}
+
+		switch (ibqp->qp_type) {
+		case IB_QPT_XRC_INI:
+			xrc = seg;
+			seg += sizeof(*xrc);
+			size += sizeof(*xrc) / 16;
+			fallthrough;
+		case IB_QPT_RC:
+			err = handle_qpt_rc(dev, qp, wr, &ctrl, &seg, &size,
+					    &cur_edge, &idx, nreq, fence,
+					    next_fence, &num_sge);
+			if (unlikely(err)) {
+				*bad_wr = wr;
+				goto out;
+			} else if (wr->opcode == IB_WR_REG_MR_INTEGRITY) {
+				goto skip_psv;
+			}
+			break;
+
+		case IB_QPT_UC:
+			handle_qpt_uc(wr, &seg, &size);
+			break;
+		case IB_QPT_SMI:
+			if (unlikely(!mdev->port_caps[qp->port - 1].has_smi)) {
+				mlx5_ib_warn(dev, "Send SMP MADs is not allowed\n");
+				err = -EPERM;
+				*bad_wr = wr;
+				goto out;
+			}
+			fallthrough;
+		case MLX5_IB_QPT_HW_GSI:
+			handle_qpt_hw_gsi(qp, wr, &seg, &size, &cur_edge);
+			break;
+		case IB_QPT_UD:
+			handle_qpt_ud(qp, wr, &seg, &size, &cur_edge);
+			break;
+		case MLX5_IB_QPT_REG_UMR:
+			err = handle_qpt_reg_umr(dev, qp, wr, &ctrl, &seg,
+						       &size, &cur_edge, idx);
+			if (unlikely(err))
+				goto out;
+			break;
+
+		default:
+			break;
+		}
+
+		if (wr->send_flags & IB_SEND_INLINE && num_sge) {
+			err = set_data_inl_seg(qp, wr, &seg, &size, &cur_edge);
+			if (unlikely(err)) {
+				mlx5_ib_warn(dev, "\n");
+				*bad_wr = wr;
+				goto out;
+			}
+		} else {
+			for (i = 0; i < num_sge; i++) {
+				handle_post_send_edge(&qp->sq, &seg, size,
+						      &cur_edge);
+				if (unlikely(!wr->sg_list[i].length))
+					continue;
+
+				set_data_ptr_seg(
+					(struct mlx5_wqe_data_seg *)seg,
+					wr->sg_list + i);
+				size += sizeof(struct mlx5_wqe_data_seg) / 16;
+				seg += sizeof(struct mlx5_wqe_data_seg);
+			}
+		}
+
+		qp->next_fence = next_fence;
+		finish_wqe(qp, ctrl, seg, size, cur_edge, idx, wr->wr_id, nreq,
+			   fence, mlx5_ib_opcode[wr->opcode]);
+skip_psv:
+		if (0)
+			dump_wqe(qp, idx, size);
+	}
+
+out:
+	if (likely(nreq)) {
+		qp->sq.head += nreq;
+
+		/* Make sure that descriptors are written before
+		 * updating doorbell record and ringing the doorbell
+		 */
+		wmb();
+
+		qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post);
+
+		/* Make sure doorbell record is visible to the HCA before
+		 * we hit doorbell.
+		 */
+		wmb();
+
+		mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset);
+		/* Make sure doorbells don't leak out of SQ spinlock
+		 * and reach the HCA out of order.
+		 */
+		bf->offset ^= bf->buf_size;
+	}
+
+	spin_unlock_irqrestore(&qp->sq.lock, flags);
+
+	return err;
+}
+
+static void set_sig_seg(struct mlx5_rwqe_sig *sig, int max_gs)
+{
+	 sig->signature = calc_sig(sig, (max_gs + 1) << 2);
+}
+
+int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
+		      const struct ib_recv_wr **bad_wr, bool drain)
+{
+	struct mlx5_ib_qp *qp = to_mqp(ibqp);
+	struct mlx5_wqe_data_seg *scat;
+	struct mlx5_rwqe_sig *sig;
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx5_core_dev *mdev = dev->mdev;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int ind;
+	int i;
+
+	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR &&
+		     !drain)) {
+		*bad_wr = wr;
+		return -EIO;
+	}
+
+	if (unlikely(ibqp->qp_type == IB_QPT_GSI))
+		return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr);
+
+	spin_lock_irqsave(&qp->rq.lock, flags);
+
+	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+
+	for (nreq = 0; wr; nreq++, wr = wr->next) {
+		if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		scat = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ind);
+		if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE)
+			scat++;
+
+		for (i = 0; i < wr->num_sge; i++)
+			set_data_ptr_seg(scat + i, wr->sg_list + i);
+
+		if (i < qp->rq.max_gs) {
+			scat[i].byte_count = 0;
+			scat[i].lkey       = cpu_to_be32(MLX5_INVALID_LKEY);
+			scat[i].addr       = 0;
+		}
+
+		if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) {
+			sig = (struct mlx5_rwqe_sig *)scat;
+			set_sig_seg(sig, qp->rq.max_gs);
+		}
+
+		qp->rq.wrid[ind] = wr->wr_id;
+
+		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
+	}
+
+out:
+	if (likely(nreq)) {
+		qp->rq.head += nreq;
+
+		/* Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		*qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
+	}
+
+	spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+	return err;
+}
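
mlx5_ib_post_send() and mlx5_ib_post_recv() are only reached through the core verbs entry points. A minimal, hedged usage sketch of a caller that ends up in the RC branch of the opcode switch above, given rdma/ib_verbs.h; the helper name and all addresses and keys are illustrative placeholders assumed to be set up by the ULP:

/* Illustrative only: post one signaled RDMA WRITE on a connected RC QP. */
static int example_post_rdma_write(struct ib_qp *qp, struct ib_pd *pd,
				   u64 dma_addr, u32 len,
				   u64 remote_addr, u32 rkey)
{
	struct ib_sge sge = {
		.addr   = dma_addr,
		.length = len,
		.lkey   = pd->local_dma_lkey,
	};
	struct ib_rdma_wr wr = {
		.wr = {
			.opcode     = IB_WR_RDMA_WRITE,
			.send_flags = IB_SEND_SIGNALED,
			.sg_list    = &sge,
			.num_sge    = 1,
		},
		.remote_addr = remote_addr,
		.rkey        = rkey,
	};
	const struct ib_send_wr *bad_wr;

	return ib_post_send(qp, &wr.wr, &bad_wr);
}

ib_post_send() dispatches to the device's post_send op, which for mlx5 is the nodrain wrapper declared in wr.h below.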
diff --git a/drivers/infiniband/hw/mlx5/wr.h b/drivers/infiniband/hw/mlx5/wr.h
new file mode 100644
index 0000000..4f00575
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/wr.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+/*
+ * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved.
+ */
+
+#ifndef _MLX5_IB_WR_H
+#define _MLX5_IB_WR_H
+
+#include "mlx5_ib.h"
+
+enum {
+	MLX5_IB_SQ_UMR_INLINE_THRESHOLD = 64,
+};
+
+struct mlx5_wqe_eth_pad {
+	u8 rsvd0[16];
+};
+
+
+/* get_sq_edge - Get the next nearby edge.
+ *
+ * An 'edge' is defined as the first address following the end of the
+ * fragment or the SQ. Accordingly, WQE construction, which repeatedly
+ * advances the pointer to write the next piece of data, simply has to
+ * check whether it has reached an edge.
+ *
+ * @sq - SQ buffer.
+ * @idx - Stride index in the SQ buffer.
+ *
+ * Return:
+ *	The new edge.
+ */
+static inline void *get_sq_edge(struct mlx5_ib_wq *sq, u32 idx)
+{
+	void *fragment_end;
+
+	fragment_end = mlx5_frag_buf_get_wqe
+		(&sq->fbc,
+		 mlx5_frag_buf_get_idx_last_contig_stride(&sq->fbc, idx));
+
+	return fragment_end + MLX5_SEND_WQE_BB;
+}
+
+int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
+		      const struct ib_send_wr **bad_wr, bool drain);
+int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
+		      const struct ib_recv_wr **bad_wr, bool drain);
+
+static inline int mlx5_ib_post_send_nodrain(struct ib_qp *ibqp,
+					    const struct ib_send_wr *wr,
+					    const struct ib_send_wr **bad_wr)
+{
+	return mlx5_ib_post_send(ibqp, wr, bad_wr, false);
+}
+
+static inline int mlx5_ib_post_send_drain(struct ib_qp *ibqp,
+					  const struct ib_send_wr *wr,
+					  const struct ib_send_wr **bad_wr)
+{
+	return mlx5_ib_post_send(ibqp, wr, bad_wr, true);
+}
+
+static inline int mlx5_ib_post_recv_nodrain(struct ib_qp *ibqp,
+					    const struct ib_recv_wr *wr,
+					    const struct ib_recv_wr **bad_wr)
+{
+	return mlx5_ib_post_recv(ibqp, wr, bad_wr, false);
+}
+
+static inline int mlx5_ib_post_recv_drain(struct ib_qp *ibqp,
+					  const struct ib_recv_wr *wr,
+					  const struct ib_recv_wr **bad_wr)
+{
+	return mlx5_ib_post_recv(ibqp, wr, bad_wr, true);
+}
+#endif /* _MLX5_IB_WR_H */
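
Only the *_nodrain() wrappers are exposed through the device's verbs ops; the drain variants are kept for mlx5_ib_drain_sq()/mlx5_ib_drain_rq(), which still need to flush WQEs when the device has hit an internal error. A rough sketch of the expected wiring in main.c (illustrative fragment, not part of this hunk):

/* Sketch: the real mlx5_ib_dev_ops table is much larger. */
static const struct ib_device_ops mlx5_ib_dev_ops = {
	.post_recv = mlx5_ib_post_recv_nodrain,
	.post_send = mlx5_ib_post_send_nodrain,
	/* ... */
};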
diff --git a/drivers/infiniband/hw/mthca/Kconfig b/drivers/infiniband/hw/mthca/Kconfig
index 66ff527..faa7381 100644
--- a/drivers/infiniband/hw/mthca/Kconfig
+++ b/drivers/infiniband/hw/mthca/Kconfig
@@ -2,7 +2,7 @@
 config INFINIBAND_MTHCA
 	tristate "Mellanox HCA support"
 	depends on PCI
-	---help---
+	help
 	  This is a low-level driver for Mellanox InfiniHost host
 	  channel adapters (HCAs), including the MT23108 PCI-X HCA
 	  ("Tavor") and the MT25208 PCI Express HCA ("Arbel").
@@ -11,7 +11,7 @@
 	bool "Verbose debugging output" if EXPERT
 	depends on INFINIBAND_MTHCA
 	default y
-	---help---
+	help
 	  This option causes debugging code to be compiled into the
 	  mthca driver.  The output can be turned on via the
 	  debug_level module parameter (which can also be set after
diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c
index 0823c0b..f051f4e 100644
--- a/drivers/infiniband/hw/mthca/mthca_av.c
+++ b/drivers/infiniband/hw/mthca/mthca_av.c
@@ -115,7 +115,7 @@
 	switch ((cur_rate - 1) / req_rate) {
 	case 0:	 return MTHCA_RATE_MEMFREE_FULL;
 	case 1:	 return MTHCA_RATE_MEMFREE_HALF;
-	case 2:	 /* fall through */
+	case 2:
 	case 3:	 return MTHCA_RATE_MEMFREE_QUARTER;
 	default: return MTHCA_RATE_MEMFREE_EIGHTH;
 	}
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index 58d4644..a445160 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -477,16 +477,6 @@
 			u32 access, struct mthca_mr *mr);
 void mthca_free_mr(struct mthca_dev *dev,  struct mthca_mr *mr);
 
-int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
-		    u32 access, struct mthca_fmr *fmr);
-int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
-			     int list_len, u64 iova);
-void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
-int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
-			     int list_len, u64 iova);
-void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
-int mthca_free_fmr(struct mthca_dev *dev,  struct mthca_fmr *fmr);
-
 int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt);
 void mthca_unmap_eq_icm(struct mthca_dev *dev);
 
@@ -557,7 +547,7 @@
 		    struct ib_qp_cap *cap,
 		    int qpn,
 		    int port,
-		    struct mthca_sqp *sqp,
+		    struct mthca_qp *qp,
 		    struct ib_udata *udata);
 void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp);
 int mthca_create_ah(struct mthca_dev *dev,
@@ -575,14 +565,10 @@
 int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
 int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
 
-int mthca_process_mad(struct ib_device *ibdev,
-		      int mad_flags,
-		      u8 port_num,
-		      const struct ib_wc *in_wc,
-		      const struct ib_grh *in_grh,
-		      const struct ib_mad_hdr *in, size_t in_mad_size,
-		      struct ib_mad_hdr *out, size_t *out_mad_size,
-		      u16 *out_mad_pkey_index);
+int mthca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+		      const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		      const struct ib_mad *in, struct ib_mad *out,
+		      size_t *out_mad_size, u16 *out_mad_pkey_index);
 int mthca_create_agents(struct mthca_dev *dev);
 void mthca_free_agents(struct mthca_dev *dev);
 
diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c
index 7ad517d..99aa818 100644
--- a/drivers/infiniband/hw/mthca/mthca_mad.c
+++ b/drivers/infiniband/hw/mthca/mthca_mad.c
@@ -196,30 +196,19 @@
 	}
 }
 
-int mthca_process_mad(struct ib_device *ibdev,
-		      int mad_flags,
-		      u8 port_num,
-		      const struct ib_wc *in_wc,
-		      const struct ib_grh *in_grh,
-		      const struct ib_mad_hdr *in, size_t in_mad_size,
-		      struct ib_mad_hdr *out, size_t *out_mad_size,
-		      u16 *out_mad_pkey_index)
+int mthca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+		      const struct ib_wc *in_wc, const struct ib_grh *in_grh,
+		      const struct ib_mad *in, struct ib_mad *out,
+		      size_t *out_mad_size, u16 *out_mad_pkey_index)
 {
 	int err;
 	u16 slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE);
 	u16 prev_lid = 0;
 	struct ib_port_attr pattr;
-	const struct ib_mad *in_mad = (const struct ib_mad *)in;
-	struct ib_mad *out_mad = (struct ib_mad *)out;
-
-	if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
-			 *out_mad_size != sizeof(*out_mad)))
-		return IB_MAD_RESULT_FAILURE;
 
 	/* Forward locally generated traps to the SM */
-	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP &&
-	    slid == 0) {
-		forward_trap(to_mdev(ibdev), port_num, in_mad);
+	if (in->mad_hdr.method == IB_MGMT_METHOD_TRAP && !slid) {
+		forward_trap(to_mdev(ibdev), port_num, in);
 		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
 	}
 
@@ -229,40 +218,39 @@
 	 * Only handle PMA and Mellanox vendor-specific class gets and
 	 * sets for other classes.
 	 */
-	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
-	    in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
-		if (in_mad->mad_hdr.method   != IB_MGMT_METHOD_GET &&
-		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_SET &&
-		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_TRAP_REPRESS)
+	if (in->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	    in->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		if (in->mad_hdr.method != IB_MGMT_METHOD_GET &&
+		    in->mad_hdr.method != IB_MGMT_METHOD_SET &&
+		    in->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS)
 			return IB_MAD_RESULT_SUCCESS;
 
 		/*
 		 * Don't process SMInfo queries or vendor-specific
 		 * MADs -- the SMA can't handle them.
 		 */
-		if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO ||
-		    ((in_mad->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) ==
+		if (in->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO ||
+		    ((in->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) ==
 		     IB_SMP_ATTR_VENDOR_MASK))
 			return IB_MAD_RESULT_SUCCESS;
-	} else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
-		   in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS1     ||
-		   in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS2) {
-		if (in_mad->mad_hdr.method  != IB_MGMT_METHOD_GET &&
-		    in_mad->mad_hdr.method  != IB_MGMT_METHOD_SET)
+	} else if (in->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+		   in->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS1     ||
+		   in->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS2) {
+		if (in->mad_hdr.method != IB_MGMT_METHOD_GET &&
+		    in->mad_hdr.method != IB_MGMT_METHOD_SET)
 			return IB_MAD_RESULT_SUCCESS;
 	} else
 		return IB_MAD_RESULT_SUCCESS;
-	if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
-	     in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
-	    in_mad->mad_hdr.method == IB_MGMT_METHOD_SET &&
-	    in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+	if ((in->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	     in->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+	    in->mad_hdr.method == IB_MGMT_METHOD_SET &&
+	    in->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
 	    !ib_query_port(ibdev, port_num, &pattr))
 		prev_lid = ib_lid_cpu16(pattr.lid);
 
-	err = mthca_MAD_IFC(to_mdev(ibdev),
-			    mad_flags & IB_MAD_IGNORE_MKEY,
-			    mad_flags & IB_MAD_IGNORE_BKEY,
-			    port_num, in_wc, in_grh, in_mad, out_mad);
+	err = mthca_MAD_IFC(to_mdev(ibdev), mad_flags & IB_MAD_IGNORE_MKEY,
+			    mad_flags & IB_MAD_IGNORE_BKEY, port_num, in_wc,
+			    in_grh, in, out);
 	if (err == -EBADMSG)
 		return IB_MAD_RESULT_SUCCESS;
 	else if (err) {
@@ -270,16 +258,16 @@
 		return IB_MAD_RESULT_FAILURE;
 	}
 
-	if (!out_mad->mad_hdr.status) {
-		smp_snoop(ibdev, port_num, in_mad, prev_lid);
-		node_desc_override(ibdev, out_mad);
+	if (!out->mad_hdr.status) {
+		smp_snoop(ibdev, port_num, in, prev_lid);
+		node_desc_override(ibdev, out);
 	}
 
 	/* set return bit in status of directed route responses */
-	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
-		out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+	if (in->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		out->mad_hdr.status |= cpu_to_be16(1 << 15);
 
-	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+	if (in->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
 		/* no response for trap repress */
 		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
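
Note on the hunk above: since the MAD-core rework (around v5.5) the RDMA core hands ->process_mad() fully typed, fixed-size struct ib_mad buffers, so the per-driver casts from ib_mad_hdr and the in_mad_size/out_mad_size WARN_ON checks are no longer needed. A minimal sketch of a handler under the new prototype (hypothetical "foo_" driver, only PMA queries answered):

	static int foo_process_mad(struct ib_device *ibdev, int mad_flags,
				   u8 port_num, const struct ib_wc *in_wc,
				   const struct ib_grh *in_grh,
				   const struct ib_mad *in, struct ib_mad *out,
				   size_t *out_mad_size, u16 *out_mad_pkey_index)
	{
		if (in->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
			return IB_MAD_RESULT_SUCCESS;

		/* fill out->data with hardware counters here */
		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
	}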
 
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
index edccfd6..fa80858 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -58,7 +58,7 @@
 		u64                uvirt;
 		struct scatterlist mem;
 		int                refcount;
-	}                page[0];
+	} page[];
 };
 
 static void mthca_free_icm_pages(struct mthca_dev *dev, struct mthca_icm_chunk *chunk)
@@ -472,7 +472,7 @@
 		goto out;
 	}
 
-	ret = get_user_pages_fast(uaddr & PAGE_MASK, 1,
+	ret = pin_user_pages_fast(uaddr & PAGE_MASK, 1,
 				  FOLL_WRITE | FOLL_LONGTERM, pages);
 	if (ret < 0)
 		goto out;
@@ -482,7 +482,7 @@
 
 	ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
 	if (ret < 0) {
-		put_user_page(pages[0]);
+		unpin_user_page(pages[0]);
 		goto out;
 	}
 
@@ -490,7 +490,7 @@
 				 mthca_uarc_virt(dev, uar, i));
 	if (ret) {
 		pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
-		put_user_page(sg_page(&db_tab->page[i].mem));
+		unpin_user_page(sg_page(&db_tab->page[i].mem));
 		goto out;
 	}
 
@@ -556,7 +556,7 @@
 		if (db_tab->page[i].uvirt) {
 			mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1);
 			pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
-			put_user_page(sg_page(&db_tab->page[i].mem));
+			unpin_user_page(sg_page(&db_tab->page[i].mem));
 		}
 	}
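
The get_user_pages_fast()/put_user_page() to pin_user_pages_fast()/unpin_user_page() switch above is the FOLL_PIN conversion: pages that will be targets of long-lived DMA must be pinned and later released with the matching unpin helper. A trimmed sketch of the pairing (illustrative names, error handling reduced):

	static int pin_one_page_for_dma(unsigned long uaddr, struct page **page)
	{
		int ret;

		ret = pin_user_pages_fast(uaddr & PAGE_MASK, 1,
					  FOLL_WRITE | FOLL_LONGTERM, page);
		if (ret < 0)
			return ret;

		/* ... dma-map the page and program the HCA ... */
		return 0;
	}

	static void release_pinned_page(struct page *page)
	{
		/* must pair with pin_user_pages*(), not put_page() */
		unpin_user_page(page);
	}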
 
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.h b/drivers/infiniband/hw/mthca/mthca_memfree.h
index da9b8f9..f9a2e65 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.h
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.h
@@ -68,7 +68,7 @@
 	int               lowmem;
 	int               coherent;
 	struct mutex      mutex;
-	struct mthca_icm *icm[0];
+	struct mthca_icm *icm[];
 };
 
 struct mthca_icm_iter {
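
The [0] -> [] conversions in this header and in mthca_memfree.c replace the old GNU zero-length-array idiom with a C99 flexible array member, which lets the compiler and helpers such as struct_size() reason about the allocation size. A sketch of the usual allocation pattern (names illustrative, not the driver's):

	#include <linux/overflow.h>	/* struct_size() */

	struct icm_table {
		int               num_icm;
		struct mthca_icm *icm[];	/* flexible array member */
	};

	static struct icm_table *icm_table_alloc(int num_icm)
	{
		struct icm_table *table;

		table = kzalloc(struct_size(table, icm, num_icm), GFP_KERNEL);
		if (table)
			table->num_icm = num_icm;
		return table;
	}
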
diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c
index 4250b2c..ce0e086 100644
--- a/drivers/infiniband/hw/mthca/mthca_mr.c
+++ b/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -541,7 +541,7 @@
 	return err;
 }
 
-/* Free mr or fmr */
+/* Free mr */
 static void mthca_free_region(struct mthca_dev *dev, u32 lkey)
 {
 	mthca_table_put(dev, dev->mr_table.mpt_table,
@@ -564,266 +564,6 @@
 	mthca_free_mtt(dev, mr->mtt);
 }
 
-int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
-		    u32 access, struct mthca_fmr *mr)
-{
-	struct mthca_mpt_entry *mpt_entry;
-	struct mthca_mailbox *mailbox;
-	u64 mtt_seg;
-	u32 key, idx;
-	int list_len = mr->attr.max_pages;
-	int err = -ENOMEM;
-	int i;
-
-	if (mr->attr.page_shift < 12 || mr->attr.page_shift >= 32)
-		return -EINVAL;
-
-	/* For Arbel, all MTTs must fit in the same page. */
-	if (mthca_is_memfree(dev) &&
-	    mr->attr.max_pages * sizeof *mr->mem.arbel.mtts > PAGE_SIZE)
-		return -EINVAL;
-
-	mr->maps = 0;
-
-	key = mthca_alloc(&dev->mr_table.mpt_alloc);
-	if (key == -1)
-		return -ENOMEM;
-	key = adjust_key(dev, key);
-
-	idx = key & (dev->limits.num_mpts - 1);
-	mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
-
-	if (mthca_is_memfree(dev)) {
-		err = mthca_table_get(dev, dev->mr_table.mpt_table, key);
-		if (err)
-			goto err_out_mpt_free;
-
-		mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key, NULL);
-		BUG_ON(!mr->mem.arbel.mpt);
-	} else
-		mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base +
-			sizeof *(mr->mem.tavor.mpt) * idx;
-
-	mr->mtt = __mthca_alloc_mtt(dev, list_len, dev->mr_table.fmr_mtt_buddy);
-	if (IS_ERR(mr->mtt)) {
-		err = PTR_ERR(mr->mtt);
-		goto err_out_table;
-	}
-
-	mtt_seg = mr->mtt->first_seg * dev->limits.mtt_seg_size;
-
-	if (mthca_is_memfree(dev)) {
-		mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table,
-						      mr->mtt->first_seg,
-						      &mr->mem.arbel.dma_handle);
-		BUG_ON(!mr->mem.arbel.mtts);
-	} else
-		mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg;
-
-	mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
-	if (IS_ERR(mailbox)) {
-		err = PTR_ERR(mailbox);
-		goto err_out_free_mtt;
-	}
-
-	mpt_entry = mailbox->buf;
-
-	mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS     |
-				       MTHCA_MPT_FLAG_MIO         |
-				       MTHCA_MPT_FLAG_REGION      |
-				       access);
-
-	mpt_entry->page_size = cpu_to_be32(mr->attr.page_shift - 12);
-	mpt_entry->key       = cpu_to_be32(key);
-	mpt_entry->pd        = cpu_to_be32(pd);
-	memset(&mpt_entry->start, 0,
-	       sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, start));
-	mpt_entry->mtt_seg   = cpu_to_be64(dev->mr_table.mtt_base + mtt_seg);
-
-	if (0) {
-		mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
-		for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
-			if (i % 4 == 0)
-				printk("[%02x] ", i * 4);
-			printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i]));
-			if ((i + 1) % 4 == 0)
-				printk("\n");
-		}
-	}
-
-	err = mthca_SW2HW_MPT(dev, mailbox,
-			      key & (dev->limits.num_mpts - 1));
-	if (err) {
-		mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
-		goto err_out_mailbox_free;
-	}
-
-	mthca_free_mailbox(dev, mailbox);
-	return 0;
-
-err_out_mailbox_free:
-	mthca_free_mailbox(dev, mailbox);
-
-err_out_free_mtt:
-	mthca_free_mtt(dev, mr->mtt);
-
-err_out_table:
-	mthca_table_put(dev, dev->mr_table.mpt_table, key);
-
-err_out_mpt_free:
-	mthca_free(&dev->mr_table.mpt_alloc, key);
-	return err;
-}
-
-int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr)
-{
-	if (fmr->maps)
-		return -EBUSY;
-
-	mthca_free_region(dev, fmr->ibmr.lkey);
-	mthca_free_mtt(dev, fmr->mtt);
-
-	return 0;
-}
-
-static inline int mthca_check_fmr(struct mthca_fmr *fmr, u64 *page_list,
-				  int list_len, u64 iova)
-{
-	int i, page_mask;
-
-	if (list_len > fmr->attr.max_pages)
-		return -EINVAL;
-
-	page_mask = (1 << fmr->attr.page_shift) - 1;
-
-	/* We are getting page lists, so va must be page aligned. */
-	if (iova & page_mask)
-		return -EINVAL;
-
-	/* Trust the user not to pass misaligned data in page_list */
-	if (0)
-		for (i = 0; i < list_len; ++i) {
-			if (page_list[i] & ~page_mask)
-				return -EINVAL;
-		}
-
-	if (fmr->maps >= fmr->attr.max_maps)
-		return -EINVAL;
-
-	return 0;
-}
-
-
-int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
-			     int list_len, u64 iova)
-{
-	struct mthca_fmr *fmr = to_mfmr(ibfmr);
-	struct mthca_dev *dev = to_mdev(ibfmr->device);
-	struct mthca_mpt_entry mpt_entry;
-	u32 key;
-	int i, err;
-
-	err = mthca_check_fmr(fmr, page_list, list_len, iova);
-	if (err)
-		return err;
-
-	++fmr->maps;
-
-	key = tavor_key_to_hw_index(fmr->ibmr.lkey);
-	key += dev->limits.num_mpts;
-	fmr->ibmr.lkey = fmr->ibmr.rkey = tavor_hw_index_to_key(key);
-
-	writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt);
-
-	for (i = 0; i < list_len; ++i) {
-		__be64 mtt_entry = cpu_to_be64(page_list[i] |
-					       MTHCA_MTT_FLAG_PRESENT);
-		mthca_write64_raw(mtt_entry, fmr->mem.tavor.mtts + i);
-	}
-
-	mpt_entry.lkey   = cpu_to_be32(key);
-	mpt_entry.length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift));
-	mpt_entry.start  = cpu_to_be64(iova);
-
-	__raw_writel((__force u32) mpt_entry.lkey, &fmr->mem.tavor.mpt->key);
-	memcpy_toio(&fmr->mem.tavor.mpt->start, &mpt_entry.start,
-		    offsetof(struct mthca_mpt_entry, window_count) -
-		    offsetof(struct mthca_mpt_entry, start));
-
-	writeb(MTHCA_MPT_STATUS_HW, fmr->mem.tavor.mpt);
-
-	return 0;
-}
-
-int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
-			     int list_len, u64 iova)
-{
-	struct mthca_fmr *fmr = to_mfmr(ibfmr);
-	struct mthca_dev *dev = to_mdev(ibfmr->device);
-	u32 key;
-	int i, err;
-
-	err = mthca_check_fmr(fmr, page_list, list_len, iova);
-	if (err)
-		return err;
-
-	++fmr->maps;
-
-	key = arbel_key_to_hw_index(fmr->ibmr.lkey);
-	if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
-		key += SINAI_FMR_KEY_INC;
-	else
-		key += dev->limits.num_mpts;
-	fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key);
-
-	*(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW;
-
-	wmb();
-
-	dma_sync_single_for_cpu(&dev->pdev->dev, fmr->mem.arbel.dma_handle,
-				list_len * sizeof(u64), DMA_TO_DEVICE);
-
-	for (i = 0; i < list_len; ++i)
-		fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] |
-						     MTHCA_MTT_FLAG_PRESENT);
-
-	dma_sync_single_for_device(&dev->pdev->dev, fmr->mem.arbel.dma_handle,
-				   list_len * sizeof(u64), DMA_TO_DEVICE);
-
-	fmr->mem.arbel.mpt->key    = cpu_to_be32(key);
-	fmr->mem.arbel.mpt->lkey   = cpu_to_be32(key);
-	fmr->mem.arbel.mpt->length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift));
-	fmr->mem.arbel.mpt->start  = cpu_to_be64(iova);
-
-	wmb();
-
-	*(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_HW;
-
-	wmb();
-
-	return 0;
-}
-
-void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
-{
-	if (!fmr->maps)
-		return;
-
-	fmr->maps = 0;
-
-	writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt);
-}
-
-void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
-{
-	if (!fmr->maps)
-		return;
-
-	fmr->maps = 0;
-
-	*(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW;
-}
-
 int mthca_init_mr_table(struct mthca_dev *dev)
 {
 	phys_addr_t addr;
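
The large block removed above (mthca_fmr_alloc/mthca_free_fmr and the Tavor/Arbel map/unmap paths) is part of the kernel-wide removal of the legacy ib_fmr interface; remaining in-kernel users had already moved to fast-registration (FRWR) memory regions. A rough consumer-side sketch of the path that replaces FMR remapping, using only core verbs (not code from this patch; the IB_WR_REG_MR posting step is omitted):

	static int frwr_map_sketch(struct ib_pd *pd, struct scatterlist *sgl,
				   int sg_nents, u32 max_pages)
	{
		struct ib_mr *mr;
		int mapped;

		mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, max_pages);
		if (IS_ERR(mr))
			return PTR_ERR(mr);

		mapped = ib_map_mr_sg(mr, sgl, sg_nents, NULL, PAGE_SIZE);
		if (mapped < sg_nents) {
			ib_dereg_mr(mr);
			return mapped < 0 ? mapped : -EINVAL;
		}

		/* post an IB_WR_REG_MR work request carrying mr->lkey,
		 * then use the MR; invalidate before remapping */
		return 0;
	}
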
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
index 23554d8..c4d9cdc 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.c
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -118,16 +118,6 @@
 	props->max_mcast_qp_attach = MTHCA_QP_PER_MGM;
 	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
 					   props->max_mcast_grp;
-	/*
-	 * If Sinai memory key optimization is being used, then only
-	 * the 8-bit key portion will change.  For other HCAs, the
-	 * unused index bits will also be used for FMR remapping.
-	 */
-	if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
-		props->max_map_per_fmr = 255;
-	else
-		props->max_map_per_fmr =
-			(1 << (32 - ilog2(mdev->limits.num_mpts))) - 1;
 
 	err = 0;
  out:
@@ -383,24 +373,27 @@
 	return 0;
 }
 
-static void mthca_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+static int mthca_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
 	mthca_pd_free(to_mdev(pd->device), to_mpd(pd));
+	return 0;
 }
 
-static int mthca_ah_create(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
-			   u32 flags, struct ib_udata *udata)
+static int mthca_ah_create(struct ib_ah *ibah,
+			   struct rdma_ah_init_attr *init_attr,
+			   struct ib_udata *udata)
 
 {
 	struct mthca_ah *ah = to_mah(ibah);
 
-	return mthca_create_ah(to_mdev(ibah->device), to_mpd(ibah->pd), ah_attr,
-			       ah);
+	return mthca_create_ah(to_mdev(ibah->device), to_mpd(ibah->pd),
+			       init_attr->ah_attr, ah);
 }
 
-static void mthca_ah_destroy(struct ib_ah *ah, u32 flags)
+static int mthca_ah_destroy(struct ib_ah *ah, u32 flags)
 {
 	mthca_destroy_ah(to_mdev(ah->device), to_mah(ah));
+	return 0;
 }
 
 static int mthca_create_srq(struct ib_srq *ibsrq,
@@ -449,7 +442,7 @@
 	return 0;
 }
 
-static void mthca_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
+static int mthca_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
 {
 	if (udata) {
 		struct mthca_ucontext *context =
@@ -463,6 +456,7 @@
 	}
 
 	mthca_free_srq(to_mdev(srq->device), to_msrq(srq));
+	return 0;
 }
 
 static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
@@ -541,13 +535,14 @@
 	case IB_QPT_SMI:
 	case IB_QPT_GSI:
 	{
-		/* Don't allow userspace to create special QPs */
-		if (udata)
-			return ERR_PTR(-EINVAL);
-
-		qp = kzalloc(sizeof(struct mthca_sqp), GFP_KERNEL);
+		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
 		if (!qp)
 			return ERR_PTR(-ENOMEM);
+		qp->sqp = kzalloc(sizeof(struct mthca_sqp), GFP_KERNEL);
+		if (!qp->sqp) {
+			kfree(qp);
+			return ERR_PTR(-ENOMEM);
+		}
 
 		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
 
@@ -556,15 +551,16 @@
 				      to_mcq(init_attr->recv_cq),
 				      init_attr->sq_sig_type, &init_attr->cap,
 				      qp->ibqp.qp_num, init_attr->port_num,
-				      to_msqp(qp), udata);
+				      qp, udata);
 		break;
 	}
 	default:
 		/* Don't support raw QPs */
-		return ERR_PTR(-ENOSYS);
+		return ERR_PTR(-EOPNOTSUPP);
 	}
 
 	if (err) {
+		kfree(qp->sqp);
 		kfree(qp);
 		return ERR_PTR(err);
 	}
@@ -597,7 +593,8 @@
 				    to_mqp(qp)->rq.db_index);
 	}
 	mthca_free_qp(to_mdev(qp->device), to_mqp(qp));
-	kfree(qp);
+	kfree(to_mqp(qp)->sqp);
+	kfree(to_mqp(qp));
 	return 0;
 }
 
@@ -798,7 +795,7 @@
 	return ret;
 }
 
-static void mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+static int mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
 	if (udata) {
 		struct mthca_ucontext *context =
@@ -817,6 +814,7 @@
 				    to_mcq(cq)->set_ci_db_index);
 	}
 	mthca_free_cq(to_mdev(cq->device), to_mcq(cq));
+	return 0;
 }
 
 static inline u32 convert_access(int acc)
@@ -855,7 +853,7 @@
 				       u64 virt, int acc, struct ib_udata *udata)
 {
 	struct mthca_dev *dev = to_mdev(pd->device);
-	struct sg_dma_page_iter sg_iter;
+	struct ib_block_iter biter;
 	struct mthca_ucontext *context = rdma_udata_to_drv_context(
 		udata, struct mthca_ucontext, ibucontext);
 	struct mthca_mr *mr;
@@ -880,15 +878,13 @@
 	if (!mr)
 		return ERR_PTR(-ENOMEM);
 
-	mr->umem = ib_umem_get(udata, start, length, acc,
-			       ucmd.mr_attrs & MTHCA_MR_DMASYNC);
-
+	mr->umem = ib_umem_get(pd->device, start, length, acc);
 	if (IS_ERR(mr->umem)) {
 		err = PTR_ERR(mr->umem);
 		goto err;
 	}
 
-	n = ib_umem_num_pages(mr->umem);
+	n = ib_umem_num_dma_blocks(mr->umem, PAGE_SIZE);
 
 	mr->mtt = mthca_alloc_mtt(dev, n);
 	if (IS_ERR(mr->mtt)) {
@@ -906,8 +902,8 @@
 
 	write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages));
 
-	for_each_sg_dma_page(mr->umem->sg_head.sgl, &sg_iter, mr->umem->nmap, 0) {
-		pages[i++] = sg_page_iter_dma_address(&sg_iter);
+	rdma_umem_for_each_dma_block(mr->umem, &biter, PAGE_SIZE) {
+		pages[i++] = rdma_block_iter_dma_address(&biter);
 
 		/*
 		 * Be friendly to write_mtt and pass it chunks
@@ -959,69 +955,6 @@
 	return 0;
 }
 
-static struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
-				      struct ib_fmr_attr *fmr_attr)
-{
-	struct mthca_fmr *fmr;
-	int err;
-
-	fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
-	if (!fmr)
-		return ERR_PTR(-ENOMEM);
-
-	memcpy(&fmr->attr, fmr_attr, sizeof *fmr_attr);
-	err = mthca_fmr_alloc(to_mdev(pd->device), to_mpd(pd)->pd_num,
-			     convert_access(mr_access_flags), fmr);
-
-	if (err) {
-		kfree(fmr);
-		return ERR_PTR(err);
-	}
-
-	return &fmr->ibmr;
-}
-
-static int mthca_dealloc_fmr(struct ib_fmr *fmr)
-{
-	struct mthca_fmr *mfmr = to_mfmr(fmr);
-	int err;
-
-	err = mthca_free_fmr(to_mdev(fmr->device), mfmr);
-	if (err)
-		return err;
-
-	kfree(mfmr);
-	return 0;
-}
-
-static int mthca_unmap_fmr(struct list_head *fmr_list)
-{
-	struct ib_fmr *fmr;
-	int err;
-	struct mthca_dev *mdev = NULL;
-
-	list_for_each_entry(fmr, fmr_list, list) {
-		if (mdev && to_mdev(fmr->device) != mdev)
-			return -EINVAL;
-		mdev = to_mdev(fmr->device);
-	}
-
-	if (!mdev)
-		return 0;
-
-	if (mthca_is_memfree(mdev)) {
-		list_for_each_entry(fmr, fmr_list, list)
-			mthca_arbel_fmr_unmap(mdev, to_mfmr(fmr));
-
-		wmb();
-	} else
-		list_for_each_entry(fmr, fmr_list, list)
-			mthca_tavor_fmr_unmap(mdev, to_mfmr(fmr));
-
-	err = mthca_SYNC_TPT(mdev);
-	return err;
-}
-
 static ssize_t hw_rev_show(struct device *device,
 			   struct device_attribute *attr, char *buf)
 {
@@ -1205,20 +1138,6 @@
 	INIT_RDMA_OBJ_SIZE(ib_srq, mthca_srq, ibsrq),
 };
 
-static const struct ib_device_ops mthca_dev_arbel_fmr_ops = {
-	.alloc_fmr = mthca_alloc_fmr,
-	.dealloc_fmr = mthca_dealloc_fmr,
-	.map_phys_fmr = mthca_arbel_map_phys_fmr,
-	.unmap_fmr = mthca_unmap_fmr,
-};
-
-static const struct ib_device_ops mthca_dev_tavor_fmr_ops = {
-	.alloc_fmr = mthca_alloc_fmr,
-	.dealloc_fmr = mthca_dealloc_fmr,
-	.map_phys_fmr = mthca_tavor_map_phys_fmr,
-	.unmap_fmr = mthca_unmap_fmr,
-};
-
 static const struct ib_device_ops mthca_dev_arbel_ops = {
 	.post_recv = mthca_arbel_post_receive,
 	.post_send = mthca_arbel_post_send,
@@ -1277,15 +1196,6 @@
 					  &mthca_dev_tavor_srq_ops);
 	}
 
-	if (dev->mthca_flags & MTHCA_FLAG_FMR) {
-		if (mthca_is_memfree(dev))
-			ib_set_device_ops(&dev->ib_dev,
-					  &mthca_dev_arbel_fmr_ops);
-		else
-			ib_set_device_ops(&dev->ib_dev,
-					  &mthca_dev_tavor_fmr_ops);
-	}
-
 	ib_set_device_ops(&dev->ib_dev, &mthca_dev_ops);
 
 	if (mthca_is_memfree(dev))
@@ -1296,7 +1206,7 @@
 	mutex_init(&dev->cap_mask_mutex);
 
 	rdma_set_device_sysfs_group(&dev->ib_dev, &mthca_attr_group);
-	ret = ib_register_device(&dev->ib_dev, "mthca%d");
+	ret = ib_register_device(&dev->ib_dev, "mthca%d", &dev->pdev->dev);
 	if (ret)
 		return ret;
 
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h
index 596acc4..8a77483 100644
--- a/drivers/infiniband/hw/mthca/mthca_provider.h
+++ b/drivers/infiniband/hw/mthca/mthca_provider.h
@@ -76,24 +76,6 @@
 	struct mthca_mtt *mtt;
 };
 
-struct mthca_fmr {
-	struct ib_fmr      ibmr;
-	struct ib_fmr_attr attr;
-	struct mthca_mtt  *mtt;
-	int                maps;
-	union {
-		struct {
-			struct mthca_mpt_entry __iomem *mpt;
-			u64 __iomem *mtts;
-		} tavor;
-		struct {
-			struct mthca_mpt_entry *mpt;
-			__be64 *mtts;
-			dma_addr_t dma_handle;
-		} arbel;
-	} mem;
-};
-
 struct mthca_pd {
 	struct ib_pd    ibpd;
 	u32             pd_num;
@@ -258,6 +240,16 @@
 	__be32    *db;
 };
 
+struct mthca_sqp {
+	int             pkey_index;
+	u32             qkey;
+	u32             send_psn;
+	struct ib_ud_header ud_header;
+	int             header_buf_size;
+	void           *header_buf;
+	dma_addr_t      header_dma;
+};
+
 struct mthca_qp {
 	struct ib_qp           ibqp;
 	int                    refcount;
@@ -283,17 +275,7 @@
 
 	wait_queue_head_t      wait;
 	struct mutex	       mutex;
-};
-
-struct mthca_sqp {
-	struct mthca_qp qp;
-	int             pkey_index;
-	u32             qkey;
-	u32             send_psn;
-	struct ib_ud_header ud_header;
-	int             header_buf_size;
-	void           *header_buf;
-	dma_addr_t      header_dma;
+	struct mthca_sqp *sqp;
 };
 
 static inline struct mthca_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
@@ -301,11 +283,6 @@
 	return container_of(ibucontext, struct mthca_ucontext, ibucontext);
 }
 
-static inline struct mthca_fmr *to_mfmr(struct ib_fmr *ibmr)
-{
-	return container_of(ibmr, struct mthca_fmr, ibmr);
-}
-
 static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr)
 {
 	return container_of(ibmr, struct mthca_mr, ibmr);
@@ -336,9 +313,4 @@
 	return container_of(ibqp, struct mthca_qp, ibqp);
 }
 
-static inline struct mthca_sqp *to_msqp(struct mthca_qp *qp)
-{
-	return container_of(qp, struct mthca_sqp, qp);
-}
-
 #endif /* MTHCA_PROVIDER_H */
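
The header change above inverts the special-QP layout: instead of a larger struct mthca_sqp that embeds struct mthca_qp and is recovered with container_of() (the removed to_msqp() helper), every QP is now a plain mthca_qp whose optional QP0/QP1 state hangs off the new sqp pointer. In outline (illustrative types, not the driver's):

	struct sqp_state { void *header_buf; };		/* QP0/QP1-only state */

	/* old shape: the special QP embeds the generic QP and is found with
	 * container_of(qp, struct sqp_old, qp) */
	struct qp_generic { u32 qpn; };
	struct sqp_old    { struct qp_generic qp; struct sqp_state st; };

	/* new shape: one QP struct; sqp stays NULL for ordinary QPs and is
	 * allocated and freed separately for QP0/QP1 */
	struct qp_new { u32 qpn; struct sqp_state *sqp; };
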
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index d04c245..08a2a7a 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -809,7 +809,7 @@
 		qp->alt_port = attr->alt_port_num;
 
 	if (is_sqp(dev, qp))
-		store_attrs(to_msqp(qp), attr, attr_mask);
+		store_attrs(qp->sqp, attr, attr_mask);
 
 	/*
 	 * If we moved QP0 to RTR, bring the IB link up; if we moved
@@ -1368,39 +1368,40 @@
 		    struct ib_qp_cap *cap,
 		    int qpn,
 		    int port,
-		    struct mthca_sqp *sqp,
+		    struct mthca_qp *qp,
 		    struct ib_udata *udata)
 {
 	u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1;
 	int err;
 
-	sqp->qp.transport = MLX;
-	err = mthca_set_qp_size(dev, cap, pd, &sqp->qp);
+	qp->transport = MLX;
+	err = mthca_set_qp_size(dev, cap, pd, qp);
 	if (err)
 		return err;
 
-	sqp->header_buf_size = sqp->qp.sq.max * MTHCA_UD_HEADER_SIZE;
-	sqp->header_buf = dma_alloc_coherent(&dev->pdev->dev, sqp->header_buf_size,
-					     &sqp->header_dma, GFP_KERNEL);
-	if (!sqp->header_buf)
+	qp->sqp->header_buf_size = qp->sq.max * MTHCA_UD_HEADER_SIZE;
+	qp->sqp->header_buf =
+		dma_alloc_coherent(&dev->pdev->dev, qp->sqp->header_buf_size,
+				   &qp->sqp->header_dma, GFP_KERNEL);
+	if (!qp->sqp->header_buf)
 		return -ENOMEM;
 
 	spin_lock_irq(&dev->qp_table.lock);
 	if (mthca_array_get(&dev->qp_table.qp, mqpn))
 		err = -EBUSY;
 	else
-		mthca_array_set(&dev->qp_table.qp, mqpn, sqp);
+		mthca_array_set(&dev->qp_table.qp, mqpn, qp->sqp);
 	spin_unlock_irq(&dev->qp_table.lock);
 
 	if (err)
 		goto err_out;
 
-	sqp->qp.port      = port;
-	sqp->qp.qpn       = mqpn;
-	sqp->qp.transport = MLX;
+	qp->port      = port;
+	qp->qpn       = mqpn;
+	qp->transport = MLX;
 
 	err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
-				    send_policy, &sqp->qp, udata);
+				    send_policy, qp, udata);
 	if (err)
 		goto err_out_free;
 
@@ -1421,10 +1422,9 @@
 
 	mthca_unlock_cqs(send_cq, recv_cq);
 
- err_out:
-	dma_free_coherent(&dev->pdev->dev, sqp->header_buf_size,
-			  sqp->header_buf, sqp->header_dma);
-
+err_out:
+	dma_free_coherent(&dev->pdev->dev, qp->sqp->header_buf_size,
+			  qp->sqp->header_buf, qp->sqp->header_dma);
 	return err;
 }
 
@@ -1487,20 +1487,19 @@
 
 	if (is_sqp(dev, qp)) {
 		atomic_dec(&(to_mpd(qp->ibqp.pd)->sqp_count));
-		dma_free_coherent(&dev->pdev->dev,
-				  to_msqp(qp)->header_buf_size,
-				  to_msqp(qp)->header_buf,
-				  to_msqp(qp)->header_dma);
+		dma_free_coherent(&dev->pdev->dev, qp->sqp->header_buf_size,
+				  qp->sqp->header_buf, qp->sqp->header_dma);
 	} else
 		mthca_free(&dev->qp_table.alloc, qp->qpn);
 }
 
 /* Create UD header for an MLX send and build a data segment for it */
-static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp,
-			    int ind, const struct ib_ud_wr *wr,
+static int build_mlx_header(struct mthca_dev *dev, struct mthca_qp *qp, int ind,
+			    const struct ib_ud_wr *wr,
 			    struct mthca_mlx_seg *mlx,
 			    struct mthca_data_seg *data)
 {
+	struct mthca_sqp *sqp = qp->sqp;
 	int header_size;
 	int err;
 	u16 pkey;
@@ -1513,7 +1512,7 @@
 	if (err)
 		return err;
 	mlx->flags &= ~cpu_to_be32(MTHCA_NEXT_SOLICIT | 1);
-	mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MTHCA_MLX_VL15 : 0) |
+	mlx->flags |= cpu_to_be32((!qp->ibqp.qp_num ? MTHCA_MLX_VL15 : 0) |
 				  (sqp->ud_header.lrh.destination_lid ==
 				   IB_LID_PERMISSIVE ? MTHCA_MLX_SLR : 0) |
 				  (sqp->ud_header.lrh.service_level << 8));
@@ -1534,29 +1533,29 @@
 		return -EINVAL;
 	}
 
-	sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
+	sqp->ud_header.lrh.virtual_lane    = !qp->ibqp.qp_num ? 15 : 0;
 	if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
 		sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
 	sqp->ud_header.bth.solicited_event = !!(wr->wr.send_flags & IB_SEND_SOLICITED);
-	if (!sqp->qp.ibqp.qp_num)
-		ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port,
-				   sqp->pkey_index, &pkey);
+	if (!qp->ibqp.qp_num)
+		ib_get_cached_pkey(&dev->ib_dev, qp->port, sqp->pkey_index,
+				   &pkey);
 	else
-		ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port,
-				   wr->pkey_index, &pkey);
+		ib_get_cached_pkey(&dev->ib_dev, qp->port, wr->pkey_index,
+				   &pkey);
 	sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
 	sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->remote_qpn);
 	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
 	sqp->ud_header.deth.qkey = cpu_to_be32(wr->remote_qkey & 0x80000000 ?
 					       sqp->qkey : wr->remote_qkey);
-	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+	sqp->ud_header.deth.source_qpn = cpu_to_be32(qp->ibqp.qp_num);
 
 	header_size = ib_ud_header_pack(&sqp->ud_header,
 					sqp->header_buf +
 					ind * MTHCA_UD_HEADER_SIZE);
 
 	data->byte_count = cpu_to_be32(header_size);
-	data->lkey       = cpu_to_be32(to_mpd(sqp->qp.ibqp.pd)->ntmr.ibmr.lkey);
+	data->lkey       = cpu_to_be32(to_mpd(qp->ibqp.pd)->ntmr.ibmr.lkey);
 	data->addr       = cpu_to_be64(sqp->header_dma +
 				       ind * MTHCA_UD_HEADER_SIZE);
 
@@ -1639,8 +1638,8 @@
 	 * without initializing f0 and size0, and they are in fact
 	 * never used uninitialized.
 	 */
-	int uninitialized_var(size0);
-	u32 uninitialized_var(f0);
+	int size0;
+	u32 f0;
 	int ind;
 	u8 op0 = 0;
 
@@ -1735,9 +1734,9 @@
 			break;
 
 		case MLX:
-			err = build_mlx_header(dev, to_msqp(qp), ind, ud_wr(wr),
-					       wqe - sizeof (struct mthca_next_seg),
-					       wqe);
+			err = build_mlx_header(
+				dev, qp, ind, ud_wr(wr),
+				wqe - sizeof(struct mthca_next_seg), wqe);
 			if (err) {
 				*bad_wr = wr;
 				goto out;
@@ -1835,7 +1834,7 @@
 	 * without initializing size0, and it is in fact never used
 	 * uninitialized.
 	 */
-	int uninitialized_var(size0);
+	int size0;
 	int ind;
 	void *wqe;
 	void *prev_wqe;
@@ -1943,8 +1942,8 @@
 	 * without initializing f0 and size0, and they are in fact
 	 * never used uninitialized.
 	 */
-	int uninitialized_var(size0);
-	u32 uninitialized_var(f0);
+	int size0;
+	u32 f0;
 	int ind;
 	u8 op0 = 0;
 
@@ -2065,9 +2064,9 @@
 			break;
 
 		case MLX:
-			err = build_mlx_header(dev, to_msqp(qp), ind, ud_wr(wr),
-					       wqe - sizeof (struct mthca_next_seg),
-					       wqe);
+			err = build_mlx_header(
+				dev, qp, ind, ud_wr(wr),
+				wqe - sizeof(struct mthca_next_seg), wqe);
 			if (err) {
 				*bad_wr = wr;
 				goto out;
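
The uninitialized_var() removals in this file follow the tree-wide deletion of that macro (it expanded to roughly "x = x", which silenced maybe-uninitialized warnings but could equally hide real bugs); plain declarations are preferred now, and the adjacent comments already argue these variables are never read before assignment:

	int size0;	/* was: int uninitialized_var(size0); */
	u32 f0;		/* was: u32 uninitialized_var(f0);   */
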
diff --git a/drivers/infiniband/hw/ocrdma/Kconfig b/drivers/infiniband/hw/ocrdma/Kconfig
index dd4ec38..54bd70b 100644
--- a/drivers/infiniband/hw/ocrdma/Kconfig
+++ b/drivers/infiniband/hw/ocrdma/Kconfig
@@ -4,6 +4,6 @@
 	depends on ETHERNET && NETDEVICES && PCI && INET && (IPV6 || IPV6=n)
 	select NET_VENDOR_EMULEX
 	select BE2NET
-	---help---
+	help
 	  This driver provides low-level InfiniBand over Ethernet
 	  support for Emulex One Connect host channel adapters (HCAs).
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h
index 7baedc7..5eb61c1 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma.h
@@ -98,7 +98,6 @@
 	u64 max_mr_size;
 	u32 max_num_mr_pbl;
 	int max_mw;
-	int max_fmr;
 	int max_map_per_fmr;
 	int max_pages_per_frmr;
 	u16 max_ord_per_qp;
@@ -186,7 +185,6 @@
 	u32 num_pbes;
 	u32 pbl_size;
 	u32 pbe_size;
-	u64 fbo;
 	u64 va;
 };
 
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
index 8d3e36d..699a8b7 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
@@ -155,7 +155,7 @@
 	return status;
 }
 
-int ocrdma_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, u32 flags,
+int ocrdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
 		     struct ib_udata *udata)
 {
 	u32 *ahid_addr;
@@ -165,6 +165,7 @@
 	u16 vlan_tag = 0xffff;
 	const struct ib_gid_attr *sgid_attr;
 	struct ocrdma_pd *pd = get_ocrdma_pd(ibah->pd);
+	struct rdma_ah_attr *attr = init_attr->ah_attr;
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device);
 
 	if ((attr->type != RDMA_AH_ATTR_TYPE_ROCE) ||
@@ -214,12 +215,13 @@
 	return status;
 }
 
-void ocrdma_destroy_ah(struct ib_ah *ibah, u32 flags)
+int ocrdma_destroy_ah(struct ib_ah *ibah, u32 flags)
 {
 	struct ocrdma_ah *ah = get_ocrdma_ah(ibah);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device);
 
 	ocrdma_free_av(dev, ah);
+	return 0;
 }
 
 int ocrdma_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr)
@@ -247,35 +249,20 @@
 	return 0;
 }
 
-int ocrdma_process_mad(struct ib_device *ibdev,
-		       int process_mad_flags,
-		       u8 port_num,
-		       const struct ib_wc *in_wc,
-		       const struct ib_grh *in_grh,
-		       const struct ib_mad_hdr *in, size_t in_mad_size,
-		       struct ib_mad_hdr *out, size_t *out_mad_size,
+int ocrdma_process_mad(struct ib_device *ibdev, int process_mad_flags,
+		       u8 port_num, const struct ib_wc *in_wc,
+		       const struct ib_grh *in_grh, const struct ib_mad *in,
+		       struct ib_mad *out, size_t *out_mad_size,
 		       u16 *out_mad_pkey_index)
 {
-	int status;
+	int status = IB_MAD_RESULT_SUCCESS;
 	struct ocrdma_dev *dev;
-	const struct ib_mad *in_mad = (const struct ib_mad *)in;
-	struct ib_mad *out_mad = (struct ib_mad *)out;
 
-	if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
-			 *out_mad_size != sizeof(*out_mad)))
-		return IB_MAD_RESULT_FAILURE;
-
-	switch (in_mad->mad_hdr.mgmt_class) {
-	case IB_MGMT_CLASS_PERF_MGMT:
+	if (in->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT) {
 		dev = get_ocrdma_dev(ibdev);
-		if (!ocrdma_pma_counters(dev, out_mad))
-			status = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
-		else
-			status = IB_MAD_RESULT_SUCCESS;
-		break;
-	default:
-		status = IB_MAD_RESULT_SUCCESS;
-		break;
+		ocrdma_pma_counters(dev, out);
+		status |= IB_MAD_RESULT_REPLY;
 	}
+
 	return status;
 }
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
index 64cb82c..35cf2e2 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
@@ -51,17 +51,14 @@
 	OCRDMA_AH_L3_TYPE_SHIFT		= 0x1D /* 29 bits */
 };
 
-int ocrdma_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+int ocrdma_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr,
 		     struct ib_udata *udata);
-void ocrdma_destroy_ah(struct ib_ah *ah, u32 flags);
+int ocrdma_destroy_ah(struct ib_ah *ah, u32 flags);
 int ocrdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
 
-int ocrdma_process_mad(struct ib_device *,
-		       int process_mad_flags,
-		       u8 port_num,
-		       const struct ib_wc *in_wc,
-		       const struct ib_grh *in_grh,
-		       const struct ib_mad_hdr *in, size_t in_mad_size,
-		       struct ib_mad_hdr *out, size_t *out_mad_size,
+int ocrdma_process_mad(struct ib_device *dev, int process_mad_flags,
+		       u8 port_num, const struct ib_wc *in_wc,
+		       const struct ib_grh *in_grh, const struct ib_mad *in,
+		       struct ib_mad *out, size_t *out_mad_size,
 		       u16 *out_mad_pkey_index);
 #endif				/* __OCRDMA_AH_H__ */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
index d82d3ec..c51c3f4 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
@@ -1190,7 +1190,6 @@
 	attr->max_mr = rsp->max_mr;
 	attr->max_mr_size = ((u64)rsp->max_mr_size_hi << 32) |
 			      rsp->max_mr_size_lo;
-	attr->max_fmr = 0;
 	attr->max_pages_per_frmr = rsp->max_pages_per_frmr;
 	attr->max_num_mr_pbl = rsp->max_num_mr_pbl;
 	attr->max_cqe = rsp->max_cq_cqes_per_cq &
@@ -1963,6 +1962,7 @@
 	int i;
 	struct ocrdma_reg_nsmr *cmd;
 	struct ocrdma_reg_nsmr_rsp *rsp;
+	u64 fbo = hwmr->va & (hwmr->pbe_size - 1);
 
 	cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_REGISTER_NSMR, sizeof(*cmd));
 	if (!cmd)
@@ -1988,8 +1988,8 @@
 					OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT;
 	cmd->totlen_low = hwmr->len;
 	cmd->totlen_high = upper_32_bits(hwmr->len);
-	cmd->fbo_low = (u32) (hwmr->fbo & 0xffffffff);
-	cmd->fbo_high = (u32) upper_32_bits(hwmr->fbo);
+	cmd->fbo_low = lower_32_bits(fbo);
+	cmd->fbo_high = upper_32_bits(fbo);
 	cmd->va_loaddr = (u32) hwmr->va;
 	cmd->va_hiaddr = (u32) upper_32_bits(hwmr->va);
 
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
index c15cfc6..9b96661 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -166,7 +166,6 @@
 	.get_port_immutable = ocrdma_port_immutable,
 	.map_mr_sg = ocrdma_map_mr_sg,
 	.mmap = ocrdma_mmap,
-	.modify_port = ocrdma_modify_port,
 	.modify_qp = ocrdma_modify_qp,
 	.poll_cq = ocrdma_poll_cq,
 	.post_recv = ocrdma_post_recv,
@@ -256,7 +255,9 @@
 	if (ret)
 		return ret;
 
-	return ib_register_device(&dev->ibdev, "ocrdma%d");
+	dma_set_max_seg_size(&dev->nic_info.pdev->dev, UINT_MAX);
+	return ib_register_device(&dev->ibdev, "ocrdma%d",
+				  &dev->nic_info.pdev->dev);
 }
 
 static int ocrdma_alloc_resources(struct ocrdma_dev *dev)
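
As with the mthca change earlier, ib_register_device() now receives the DMA-capable struct device explicitly, and the driver lifts the DMA segment-size cap up front instead of relying on a copied dma_device. The registration tail boils down to (hypothetical "foo" driver types):

	static int foo_register_device(struct foo_dev *dev, struct pci_dev *pdev)
	{
		/* relax the DMA segment-size limit before the IB core starts
		 * building SGLs against this device */
		dma_set_max_seg_size(&pdev->dev, UINT_MAX);
		return ib_register_device(&dev->ibdev, "foo%d", &pdev->dev);
	}
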
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
index 6ef89c2..c2e0d0f 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
@@ -2034,7 +2034,7 @@
 };
 
 struct ocrdma_rx_qp_err_stats {
-	u32 nak_invalid_requst_errors;
+	u32 nak_invalid_request_errors;
 	u32 nak_remote_operation_errors;
 	u32 nak_count_remote_access_errors;
 	u32 local_length_errors;
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
index a902942..5f831e3 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
@@ -423,8 +423,8 @@
 	memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
 
 	pcur = stats;
-	pcur += ocrdma_add_stat(stats, pcur, "nak_invalid_requst_errors",
-			(u64)rx_qp_err_stats->nak_invalid_requst_errors);
+	pcur += ocrdma_add_stat(stats, pcur, "nak_invalid_request_errors",
+			(u64)rx_qp_err_stats->nak_invalid_request_errors);
 	pcur += ocrdma_add_stat(stats, pcur, "nak_remote_operation_errors",
 			(u64)rx_qp_err_stats->nak_remote_operation_errors);
 	pcur += ocrdma_add_stat(stats, pcur, "nak_count_remote_access_errors",
@@ -670,12 +670,10 @@
 	return -EFAULT;
 }
 
-int ocrdma_pma_counters(struct ocrdma_dev *dev,
-			struct ib_mad *out_mad)
+void ocrdma_pma_counters(struct ocrdma_dev *dev, struct ib_mad *out_mad)
 {
 	struct ib_pma_portcounters *pma_cnt;
 
-	memset(out_mad->data, 0, sizeof out_mad->data);
 	pma_cnt = (void *)(out_mad->data + 40);
 	ocrdma_update_stats(dev);
 
@@ -683,7 +681,6 @@
 	pma_cnt->port_rcv_data     = cpu_to_be32(ocrdma_sysfs_rcv_data(dev));
 	pma_cnt->port_xmit_packets = cpu_to_be32(ocrdma_sysfs_xmit_pkts(dev));
 	pma_cnt->port_rcv_packets  = cpu_to_be32(ocrdma_sysfs_rcv_pkts(dev));
-	return 0;
 }
 
 static ssize_t ocrdma_dbgfs_ops_read(struct file *filp, char __user *buffer,
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.h b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h
index bba1fec..98feca2 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h
@@ -69,7 +69,6 @@
 void ocrdma_release_stats_resources(struct ocrdma_dev *dev);
 void ocrdma_rem_port_stats(struct ocrdma_dev *dev);
 void ocrdma_add_port_stats(struct ocrdma_dev *dev);
-int ocrdma_pma_counters(struct ocrdma_dev *dev,
-			struct ib_mad *out_mad);
+void ocrdma_pma_counters(struct ocrdma_dev *dev, struct ib_mad *out_mad);
 
 #endif	/* __OCRDMA_STATS_H__ */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
index 55bd887..81a5600 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -99,8 +99,6 @@
 	attr->max_mw = dev->attr.max_mw;
 	attr->max_pd = dev->attr.max_pd;
 	attr->atomic_cap = 0;
-	attr->max_fmr = 0;
-	attr->max_map_per_fmr = 0;
 	attr->max_qp_rd_atom =
 	    min(dev->attr.max_ord_per_qp, dev->attr.max_ird_per_qp);
 	attr->max_qp_init_rd_atom = dev->attr.max_ord_per_qp;
@@ -114,7 +112,7 @@
 }
 
 static inline void get_link_speed_and_width(struct ocrdma_dev *dev,
-					    u8 *ib_speed, u8 *ib_width)
+					    u16 *ib_speed, u8 *ib_width)
 {
 	int status;
 	u8 speed;
@@ -190,12 +188,6 @@
 	return 0;
 }
 
-int ocrdma_modify_port(struct ib_device *ibdev, u8 port, int mask,
-		       struct ib_port_modify *props)
-{
-	return 0;
-}
-
 static int ocrdma_add_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr,
 			   unsigned long len)
 {
@@ -672,7 +664,7 @@
 	return status;
 }
 
-void ocrdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+int ocrdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
 	struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
@@ -690,10 +682,11 @@
 
 		if (is_ucontext_pd(uctx, pd)) {
 			ocrdma_release_ucontext_pd(uctx);
-			return;
+			return 0;
 		}
 	}
 	_ocrdma_dealloc_pd(dev, pd);
+	return 0;
 }
 
 static int ocrdma_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
@@ -818,14 +811,12 @@
 	return status;
 }
 
-static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
-			    u32 num_pbes)
+static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr)
 {
 	struct ocrdma_pbe *pbe;
-	struct sg_dma_page_iter sg_iter;
+	struct ib_block_iter biter;
 	struct ocrdma_pbl *pbl_tbl = mr->hwmr.pbl_table;
-	struct ib_umem *umem = mr->umem;
-	int pbe_cnt, total_num_pbes = 0;
+	int pbe_cnt;
 	u64 pg_addr;
 
 	if (!mr->hwmr.num_pbes)
@@ -834,19 +825,14 @@
 	pbe = (struct ocrdma_pbe *)pbl_tbl->va;
 	pbe_cnt = 0;
 
-	for_each_sg_dma_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+	rdma_umem_for_each_dma_block (mr->umem, &biter, PAGE_SIZE) {
 		/* store the page address in pbe */
-		pg_addr = sg_page_iter_dma_address(&sg_iter);
+		pg_addr = rdma_block_iter_dma_address(&biter);
 		pbe->pa_lo = cpu_to_le32(pg_addr);
 		pbe->pa_hi = cpu_to_le32(upper_32_bits(pg_addr));
 		pbe_cnt += 1;
-		total_num_pbes += 1;
 		pbe++;
 
-		/* if done building pbes, issue the mbx cmd. */
-		if (total_num_pbes == num_pbes)
-			return;
-
 		/* if the given pbl is full storing the pbes,
 		 * move to next pbl.
 		 */
@@ -865,7 +851,6 @@
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
 	struct ocrdma_mr *mr;
 	struct ocrdma_pd *pd;
-	u32 num_pbes;
 
 	pd = get_ocrdma_pd(ibpd);
 
@@ -875,18 +860,17 @@
 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
 	if (!mr)
 		return ERR_PTR(status);
-	mr->umem = ib_umem_get(udata, start, len, acc, 0);
+	mr->umem = ib_umem_get(ibpd->device, start, len, acc);
 	if (IS_ERR(mr->umem)) {
 		status = -EFAULT;
 		goto umem_err;
 	}
-	num_pbes = ib_umem_page_count(mr->umem);
-	status = ocrdma_get_pbl_info(dev, mr, num_pbes);
+	status = ocrdma_get_pbl_info(
+		dev, mr, ib_umem_num_dma_blocks(mr->umem, PAGE_SIZE));
 	if (status)
 		goto umem_err;
 
 	mr->hwmr.pbe_size = PAGE_SIZE;
-	mr->hwmr.fbo = ib_umem_offset(mr->umem);
 	mr->hwmr.va = usr_addr;
 	mr->hwmr.len = len;
 	mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
@@ -897,7 +881,7 @@
 	status = ocrdma_build_pbl_tbl(dev, &mr->hwmr);
 	if (status)
 		goto umem_err;
-	build_user_pbes(dev, mr, num_pbes);
+	build_user_pbes(dev, mr);
 	status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc);
 	if (status)
 		goto mbx_err;
@@ -1064,7 +1048,7 @@
 	spin_unlock_irqrestore(&cq->cq_lock, flags);
 }
 
-void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
 	struct ocrdma_eq *eq = NULL;
@@ -1089,6 +1073,7 @@
 				ocrdma_get_db_addr(dev, pdid),
 				dev->nic_info.db_page_size);
 	}
+	return 0;
 }
 
 static int ocrdma_add_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp)
@@ -1117,7 +1102,7 @@
 	    (attrs->qp_type != IB_QPT_UD)) {
 		pr_err("%s(%d) unsupported qp type=0x%x requested\n",
 		       __func__, dev->id, attrs->qp_type);
-		return -EINVAL;
+		return -EOPNOTSUPP;
 	}
 	/* Skip the check for QP1 to support CM size of 128 */
 	if ((attrs->qp_type != IB_QPT_GSI) &&
@@ -1865,7 +1850,7 @@
 	return status;
 }
 
-void ocrdma_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
+int ocrdma_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
 	struct ocrdma_srq *srq;
 	struct ocrdma_dev *dev = get_ocrdma_dev(ibsrq->device);
@@ -1880,6 +1865,7 @@
 
 	kfree(srq->idx_bit_fields);
 	kfree(srq->rqe_wr_id_tbl);
+	return 0;
 }
 
 /* unprivileged verbs and their support functions. */
@@ -2142,7 +2128,7 @@
 		case IB_WR_SEND_WITH_IMM:
 			hdr->cw |= (OCRDMA_FLAG_IMM << OCRDMA_WQE_FLAGS_SHIFT);
 			hdr->immdt = ntohl(wr->ex.imm_data);
-			/* fall through */
+			fallthrough;
 		case IB_WR_SEND:
 			hdr->cw |= (OCRDMA_SEND << OCRDMA_WQE_OPCODE_SHIFT);
 			ocrdma_build_send(qp, hdr, wr);
@@ -2156,7 +2142,7 @@
 		case IB_WR_RDMA_WRITE_WITH_IMM:
 			hdr->cw |= (OCRDMA_FLAG_IMM << OCRDMA_WQE_FLAGS_SHIFT);
 			hdr->immdt = ntohl(wr->ex.imm_data);
-			/* fall through */
+			fallthrough;
 		case IB_WR_RDMA_WRITE:
 			hdr->cw |= (OCRDMA_WRITE << OCRDMA_WQE_OPCODE_SHIFT);
 			status = ocrdma_build_write(qp, hdr, wr);
@@ -2909,7 +2895,7 @@
 }
 
 struct ib_mr *ocrdma_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
-			      u32 max_num_sg, struct ib_udata *udata)
+			      u32 max_num_sg)
 {
 	int status;
 	struct ocrdma_mr *mr;
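
The /* fall through */ comments in the work-request switch above are converted to the fallthrough pseudo-keyword, which compiles to an attribute the compiler can verify with -Wimplicit-fallthrough. Shape of the pattern (illustrative opcodes and types, not the driver's):

	static void build_send_sketch(int opcode, struct wqe_hdr *hdr)
	{
		switch (opcode) {
		case OP_SEND_WITH_IMM:
			hdr->flags |= FLAG_IMM;
			fallthrough;	/* immediate variant shares the body */
		case OP_SEND:
			hdr->opcode = OP_SEND;
			break;
		}
	}
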
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
index 32488da..425d554 100644
--- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -54,8 +54,6 @@
 int ocrdma_query_device(struct ib_device *, struct ib_device_attr *props,
 			struct ib_udata *uhw);
 int ocrdma_query_port(struct ib_device *, u8 port, struct ib_port_attr *props);
-int ocrdma_modify_port(struct ib_device *, u8 port, int mask,
-		       struct ib_port_modify *props);
 
 enum rdma_protocol_type
 ocrdma_query_protocol(struct ib_device *device, u8 port_num);
@@ -69,12 +67,12 @@
 int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
 
 int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
-void ocrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+int ocrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
 int ocrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		     struct ib_udata *udata);
 int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *);
-void ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 
 struct ib_qp *ocrdma_create_qp(struct ib_pd *,
 			       struct ib_qp_init_attr *attrs,
@@ -94,7 +92,7 @@
 int ocrdma_modify_srq(struct ib_srq *, struct ib_srq_attr *,
 		      enum ib_srq_attr_mask, struct ib_udata *);
 int ocrdma_query_srq(struct ib_srq *, struct ib_srq_attr *);
-void ocrdma_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
+int ocrdma_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
 int ocrdma_post_srq_recv(struct ib_srq *, const struct ib_recv_wr *,
 			 const struct ib_recv_wr **bad_recv_wr);
 
@@ -103,7 +101,7 @@
 struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
 				 u64 virt, int acc, struct ib_udata *);
 struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			      u32 max_num_sg, struct ib_udata *udata);
+			      u32 max_num_sg);
 int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
 		     unsigned int *sg_offset);
 
diff --git a/drivers/infiniband/hw/qedr/Kconfig b/drivers/infiniband/hw/qedr/Kconfig
index 9c30325..9e31d13 100644
--- a/drivers/infiniband/hw/qedr/Kconfig
+++ b/drivers/infiniband/hw/qedr/Kconfig
@@ -6,6 +6,6 @@
 	select QED_LL2
 	select QED_OOO
 	select QED_RDMA
-	---help---
+	help
 	  This driver provides low-level InfiniBand over Ethernet
 	  support for QLogic QED host channel adapters (HCAs).
diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c
index 93040c9..9676416 100644
--- a/drivers/infiniband/hw/qedr/main.c
+++ b/drivers/infiniband/hw/qedr/main.c
@@ -110,7 +110,6 @@
 	if (err)
 		return err;
 
-	immutable->pkey_tbl_len = 1;
 	immutable->gid_tbl_len = 1;
 	immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;
 	immutable->max_mad_size = 0;
@@ -178,7 +177,10 @@
 }
 
 static const struct ib_device_ops qedr_roce_dev_ops = {
+	.alloc_xrcd = qedr_alloc_xrcd,
+	.dealloc_xrcd = qedr_dealloc_xrcd,
 	.get_port_immutable = qedr_roce_port_immutable,
+	.query_pkey = qedr_query_pkey,
 };
 
 static void qedr_roce_register_device(struct qedr_dev *dev)
@@ -186,6 +188,10 @@
 	dev->ibdev.node_type = RDMA_NODE_IB_CA;
 
 	ib_set_device_ops(&dev->ibdev, &qedr_roce_dev_ops);
+
+	dev->ibdev.uverbs_cmd_mask |= QEDR_UVERBS(OPEN_XRCD) |
+		QEDR_UVERBS(CLOSE_XRCD) |
+		QEDR_UVERBS(CREATE_XSRQ);
 }
 
 static const struct ib_device_ops qedr_dev_ops = {
@@ -212,7 +218,7 @@
 	.get_link_layer = qedr_link_layer,
 	.map_mr_sg = qedr_map_mr_sg,
 	.mmap = qedr_mmap,
-	.modify_port = qedr_modify_port,
+	.mmap_free = qedr_mmap_free,
 	.modify_qp = qedr_modify_qp,
 	.modify_srq = qedr_modify_srq,
 	.poll_cq = qedr_poll_cq,
@@ -221,7 +227,6 @@
 	.post_srq_recv = qedr_post_srq_recv,
 	.process_mad = qedr_process_mad,
 	.query_device = qedr_query_device,
-	.query_pkey = qedr_query_pkey,
 	.query_port = qedr_query_port,
 	.query_qp = qedr_query_qp,
 	.query_srq = qedr_query_srq,
@@ -233,6 +238,7 @@
 	INIT_RDMA_OBJ_SIZE(ib_cq, qedr_cq, ibcq),
 	INIT_RDMA_OBJ_SIZE(ib_pd, qedr_pd, ibpd),
 	INIT_RDMA_OBJ_SIZE(ib_srq, qedr_srq, ibsrq),
+	INIT_RDMA_OBJ_SIZE(ib_xrcd, qedr_xrcd, ibxrcd),
 	INIT_RDMA_OBJ_SIZE(ib_ucontext, qedr_ucontext, ibucontext),
 };
 
@@ -287,7 +293,8 @@
 	if (rc)
 		return rc;
 
-	return ib_register_device(&dev->ibdev, "qedr%d");
+	dma_set_max_seg_size(&dev->pdev->dev, UINT_MAX);
+	return ib_register_device(&dev->ibdev, "qedr%d", &dev->pdev->dev);
 }
 
 /* This function allocates fast-path status block memory */
@@ -346,9 +353,14 @@
 
 static int qedr_alloc_resources(struct qedr_dev *dev)
 {
+	struct qed_chain_init_params params = {
+		.mode		= QED_CHAIN_MODE_PBL,
+		.intended_use	= QED_CHAIN_USE_TO_CONSUME,
+		.cnt_type	= QED_CHAIN_CNT_TYPE_U16,
+		.elem_size	= sizeof(struct regpair *),
+	};
 	struct qedr_cnq *cnq;
 	__le16 *cons_pi;
-	u16 n_entries;
 	int i, rc;
 
 	dev->sgid_tbl = kcalloc(QEDR_MAX_SGID, sizeof(union ib_gid),
@@ -382,7 +394,9 @@
 	dev->sb_start = dev->ops->rdma_get_start_sb(dev->cdev);
 
 	/* Allocate CNQ PBLs */
-	n_entries = min_t(u32, QED_RDMA_MAX_CNQ_SIZE, QEDR_ROCE_MAX_CNQ_SIZE);
+	params.num_elems = min_t(u32, QED_RDMA_MAX_CNQ_SIZE,
+				 QEDR_ROCE_MAX_CNQ_SIZE);
+
 	for (i = 0; i < dev->num_cnq; i++) {
 		cnq = &dev->cnq_array[i];
 
@@ -391,13 +405,8 @@
 		if (rc)
 			goto err3;
 
-		rc = dev->ops->common->chain_alloc(dev->cdev,
-						   QED_CHAIN_USE_TO_CONSUME,
-						   QED_CHAIN_MODE_PBL,
-						   QED_CHAIN_CNT_TYPE_U16,
-						   n_entries,
-						   sizeof(struct regpair *),
-						   &cnq->pbl, NULL);
+		rc = dev->ops->common->chain_alloc(dev->cdev, &cnq->pbl,
+						   &params);
 		if (rc)
 			goto err4;
 
@@ -632,7 +641,6 @@
 	attr->max_mr_size = qed_attr->max_mr_size;
 	attr->max_cqe = min_t(u64, qed_attr->max_cqe, QEDR_MAX_CQES);
 	attr->max_mw = qed_attr->max_mw;
-	attr->max_fmr = qed_attr->max_fmr;
 	attr->max_mr_mw_fmr_pbl = qed_attr->max_mr_mw_fmr_pbl;
 	attr->max_mr_mw_fmr_size = qed_attr->max_mr_mw_fmr_size;
 	attr->max_pd = qed_attr->max_pd;
@@ -705,6 +713,18 @@
 			event.event = IB_EVENT_SRQ_ERR;
 			event_type = EVENT_TYPE_SRQ;
 			break;
+		case ROCE_ASYNC_EVENT_XRC_DOMAIN_ERR:
+			event.event = IB_EVENT_QP_ACCESS_ERR;
+			event_type = EVENT_TYPE_QP;
+			break;
+		case ROCE_ASYNC_EVENT_INVALID_XRCETH_ERR:
+			event.event = IB_EVENT_QP_ACCESS_ERR;
+			event_type = EVENT_TYPE_QP;
+			break;
+		case ROCE_ASYNC_EVENT_XRC_SRQ_CATASTROPHIC_ERR:
+			event.event = IB_EVENT_CQ_ERR;
+			event_type = EVENT_TYPE_CQ;
+			break;
 		default:
 			DP_ERR(dev, "unsupported event %d on handle=%llx\n",
 			       e_code, roce_handle64);
@@ -1026,6 +1046,13 @@
 	case QEDE_CHANGE_ADDR:
 		qedr_mac_address_change(dev);
 		break;
+	case QEDE_CHANGE_MTU:
+		if (rdma_protocol_iwarp(&dev->ibdev, 1))
+			if (dev->ndev->mtu != dev->iwarp_max_mtu)
+				DP_NOTICE(dev,
+					  "Mtu was changed from %d to %d. This will not take affect for iWARP until qedr is reloaded\n",
+					  dev->iwarp_max_mtu, dev->ndev->mtu);
+		break;
 	default:
 		pr_err("Event not supported\n");
 	}
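
The CNQ allocation above moves from a long chain_alloc() argument list to a qed_chain_init_params descriptor: the fixed properties are set once and only num_elems varies per queue. Reduced to its core (same field names as the hunk, other setup elided), the per-CNQ loop becomes:

	struct qed_chain_init_params params = {
		.mode          = QED_CHAIN_MODE_PBL,
		.intended_use  = QED_CHAIN_USE_TO_CONSUME,
		.cnt_type      = QED_CHAIN_CNT_TYPE_U16,
		.elem_size     = sizeof(struct regpair *),
		.num_elems     = min_t(u32, QED_RDMA_MAX_CNQ_SIZE,
				       QEDR_ROCE_MAX_CNQ_SIZE),
	};

	for (i = 0; i < dev->num_cnq; i++) {
		rc = dev->ops->common->chain_alloc(dev->cdev,
						   &dev->cnq_array[i].pbl,
						   &params);
		if (rc)
			goto err;
	}
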
diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h
index ed56df3..9dde703 100644
--- a/drivers/infiniband/hw/qedr/qedr.h
+++ b/drivers/infiniband/hw/qedr/qedr.h
@@ -103,7 +103,6 @@
 	u64	max_mr_size;
 	u32	max_cqe;
 	u32	max_mw;
-	u32	max_fmr;
 	u32	max_mr_mw_fmr_pbl;
 	u64	max_mr_mw_fmr_size;
 	u32	max_pd;
@@ -231,14 +230,17 @@
 	struct qedr_dev *dev;
 	struct qedr_pd *pd;
 	void __iomem *dpi_addr;
+	struct rdma_user_mmap_entry *db_mmap_entry;
 	u64 dpi_phys_addr;
 	u32 dpi_size;
 	u16 dpi;
+	bool db_rec;
+	u8 edpm_mode;
+};
 
-	struct list_head mm_head;
-
-	/* Lock to protect mm list */
-	struct mutex mm_list_lock;
+union db_prod32 {
+	struct rdma_pwm_val16_data data;
+	u32 raw;
 };
 
 union db_prod64 {
@@ -266,6 +268,13 @@
 	struct qedr_pbl *pbl_tbl;
 	u64 buf_addr;
 	size_t buf_len;
+
+	/* doorbell recovery */
+	void __iomem *db_addr;
+	struct qedr_user_db_rec *db_rec_data;
+	struct rdma_user_mmap_entry *db_mmap_entry;
+	void __iomem *db_rec_db2_addr;
+	union db_prod32 db_rec_db2_data;
 };
 
 struct qedr_cq {
@@ -301,17 +310,9 @@
 	struct qedr_ucontext *uctx;
 };
 
-struct qedr_mm {
-	struct {
-		u64 phy_addr;
-		unsigned long len;
-	} key;
-	struct list_head entry;
-};
-
-union db_prod32 {
-	struct rdma_pwm_val16_data data;
-	u32 raw;
+struct qedr_xrcd {
+	struct ib_xrcd ibxrcd;
+	u16 xrcd_id;
 };
 
 struct qedr_qp_hwq_info {
@@ -365,6 +366,7 @@
 	struct ib_umem *prod_umem;
 	u16 srq_id;
 	u32 srq_limit;
+	bool is_xrc;
 	/* lock to protect srq recv post */
 	spinlock_t lock;
 };
@@ -491,6 +493,18 @@
 	u32 npages;
 };
 
+struct qedr_user_mmap_entry {
+	struct rdma_user_mmap_entry rdma_entry;
+	struct qedr_dev *dev;
+	union {
+		u64 io_address;
+		void *address;
+	};
+	size_t length;
+	u16 dpi;
+	u8 mmap_flag;
+};
+
 #define SET_FIELD2(value, name, flag) ((value) |= ((flag) << (name ## _SHIFT)))
 
 #define QEDR_RESP_IMM	(RDMA_CQE_RESPONDER_IMM_FLG_MASK << \
@@ -565,6 +579,11 @@
 	return container_of(ibpd, struct qedr_pd, ibpd);
 }
 
+static inline struct qedr_xrcd *get_qedr_xrcd(struct ib_xrcd *ibxrcd)
+{
+	return container_of(ibxrcd, struct qedr_xrcd, ibxrcd);
+}
+
 static inline struct qedr_cq *get_qedr_cq(struct ib_cq *ibcq)
 {
 	return container_of(ibcq, struct qedr_cq, ibcq);
@@ -589,4 +608,33 @@
 {
 	return container_of(ibsrq, struct qedr_srq, ibsrq);
 }
+
+static inline bool qedr_qp_has_srq(struct qedr_qp *qp)
+{
+	return qp->srq;
+}
+
+static inline bool qedr_qp_has_sq(struct qedr_qp *qp)
+{
+	if (qp->qp_type == IB_QPT_GSI || qp->qp_type == IB_QPT_XRC_TGT)
+		return 0;
+
+	return 1;
+}
+
+static inline bool qedr_qp_has_rq(struct qedr_qp *qp)
+{
+	if (qp->qp_type == IB_QPT_GSI || qp->qp_type == IB_QPT_XRC_INI ||
+	    qp->qp_type == IB_QPT_XRC_TGT || qedr_qp_has_srq(qp))
+		return 0;
+
+	return 1;
+}
+
+static inline struct qedr_user_mmap_entry *
+get_qedr_mmap_entry(struct rdma_user_mmap_entry *rdma_entry)
+{
+	return container_of(rdma_entry, struct qedr_user_mmap_entry,
+			    rdma_entry);
+}
 #endif
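
The new qedr_user_mmap_entry embeds the core's rdma_user_mmap_entry, replacing the driver-private qedr_mm list removed from the ucontext above and from verbs.c below: the core hands out an opaque offset at insert time and resolves it again in .mmap. A sketch of the .mmap side using the helper added above (hypothetical wrapper, flag handling trimmed; the insert side appears in the verbs.c hunk that follows):

	static int foo_mmap(struct ib_ucontext *uctx, struct vm_area_struct *vma)
	{
		struct rdma_user_mmap_entry *rdma_entry;
		struct qedr_user_mmap_entry *entry;
		int rc;

		rdma_entry = rdma_user_mmap_entry_get(uctx, vma);
		if (!rdma_entry)
			return -EINVAL;
		entry = get_qedr_mmap_entry(rdma_entry);

		/* QEDR_USER_MMAP_IO_WC entries map doorbell BAR space */
		rc = rdma_user_mmap_io(uctx, vma, PFN_DOWN(entry->io_address),
				       entry->length,
				       pgprot_writecombine(vma->vm_page_prot),
				       rdma_entry);

		rdma_user_mmap_entry_put(rdma_entry);
		return rc;
	}
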
diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
index b9b46a0..1715fbe 100644
--- a/drivers/infiniband/hw/qedr/qedr_iw_cm.c
+++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c
@@ -515,7 +515,7 @@
 	return rc;
 }
 
-struct qedr_qp *qedr_iw_load_qp(struct qedr_dev *dev, u32 qpn)
+static struct qedr_qp *qedr_iw_load_qp(struct qedr_dev *dev, u32 qpn)
 {
 	struct qedr_qp *qp;
 
diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c
index 4408d33..eeb87f3 100644
--- a/drivers/infiniband/hw/qedr/verbs.c
+++ b/drivers/infiniband/hw/qedr/verbs.c
@@ -59,6 +59,11 @@
 
 #define DB_ADDR_SHIFT(addr)		((addr) << DB_PWM_ADDR_OFFSET_SHIFT)
 
+enum {
+	QEDR_USER_MMAP_IO_WC = 0,
+	QEDR_USER_MMAP_PHYS_PAGE,
+};
+
 static inline int qedr_ib_copy_to_udata(struct ib_udata *udata, void *src,
 					size_t len)
 {
@@ -131,6 +136,8 @@
 	    IB_DEVICE_RC_RNR_NAK_GEN |
 	    IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_MGT_EXTENSIONS;
 
+	if (!rdma_protocol_iwarp(&dev->ibdev, 1))
+		attr->device_cap_flags |= IB_DEVICE_XRC;
 	attr->max_send_sge = qattr->max_sge;
 	attr->max_recv_sge = qattr->max_sge;
 	attr->max_sge_rd = qattr->max_sge;
@@ -140,8 +147,6 @@
 	attr->max_mw = qattr->max_mw;
 	attr->max_pd = qattr->max_pd;
 	attr->atomic_cap = dev->atomic_cap;
-	attr->max_fmr = qattr->max_fmr;
-	attr->max_map_per_fmr = 16;
 	attr->max_qp_init_rd_atom =
 	    1 << (fls(qattr->max_qp_req_rd_atomic_resc) - 1);
 	attr->max_qp_rd_atom =
@@ -154,13 +159,13 @@
 
 	attr->local_ca_ack_delay = qattr->dev_ack_delay;
 	attr->max_fast_reg_page_list_len = qattr->max_mr / 8;
-	attr->max_pkeys = QEDR_ROCE_PKEY_MAX;
+	attr->max_pkeys = qattr->max_pkey;
 	attr->max_ah = qattr->max_ah;
 
 	return 0;
 }
 
-static inline void get_link_speed_and_width(int speed, u8 *ib_speed,
+static inline void get_link_speed_and_width(int speed, u16 *ib_speed,
 					    u8 *ib_width)
 {
 	switch (speed) {
@@ -228,16 +233,16 @@
 		attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
 	}
 	attr->max_mtu = IB_MTU_4096;
-	attr->active_mtu = iboe_get_mtu(dev->ndev->mtu);
 	attr->lid = 0;
 	attr->lmc = 0;
 	attr->sm_lid = 0;
 	attr->sm_sl = 0;
 	attr->ip_gids = true;
 	if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
+		attr->active_mtu = iboe_get_mtu(dev->iwarp_max_mtu);
 		attr->gid_tbl_len = 1;
-		attr->pkey_tbl_len = 1;
 	} else {
+		attr->active_mtu = iboe_get_mtu(dev->ndev->mtu);
 		attr->gid_tbl_len = QEDR_MAX_SGID;
 		attr->pkey_tbl_len = QEDR_ROCE_PKEY_TABLE_LEN;
 	}
@@ -251,78 +256,32 @@
 	return 0;
 }
 
-int qedr_modify_port(struct ib_device *ibdev, u8 port, int mask,
-		     struct ib_port_modify *props)
-{
-	return 0;
-}
-
-static int qedr_add_mmap(struct qedr_ucontext *uctx, u64 phy_addr,
-			 unsigned long len)
-{
-	struct qedr_mm *mm;
-
-	mm = kzalloc(sizeof(*mm), GFP_KERNEL);
-	if (!mm)
-		return -ENOMEM;
-
-	mm->key.phy_addr = phy_addr;
-	/* This function might be called with a length which is not a multiple
-	 * of PAGE_SIZE, while the mapping is PAGE_SIZE grained and the kernel
-	 * forces this granularity by increasing the requested size if needed.
-	 * When qedr_mmap is called, it will search the list with the updated
-	 * length as a key. To prevent search failures, the length is rounded up
-	 * in advance to PAGE_SIZE.
-	 */
-	mm->key.len = roundup(len, PAGE_SIZE);
-	INIT_LIST_HEAD(&mm->entry);
-
-	mutex_lock(&uctx->mm_list_lock);
-	list_add(&mm->entry, &uctx->mm_head);
-	mutex_unlock(&uctx->mm_list_lock);
-
-	DP_DEBUG(uctx->dev, QEDR_MSG_MISC,
-		 "added (addr=0x%llx,len=0x%lx) for ctx=%p\n",
-		 (unsigned long long)mm->key.phy_addr,
-		 (unsigned long)mm->key.len, uctx);
-
-	return 0;
-}
-
-static bool qedr_search_mmap(struct qedr_ucontext *uctx, u64 phy_addr,
-			     unsigned long len)
-{
-	bool found = false;
-	struct qedr_mm *mm;
-
-	mutex_lock(&uctx->mm_list_lock);
-	list_for_each_entry(mm, &uctx->mm_head, entry) {
-		if (len != mm->key.len || phy_addr != mm->key.phy_addr)
-			continue;
-
-		found = true;
-		break;
-	}
-	mutex_unlock(&uctx->mm_list_lock);
-	DP_DEBUG(uctx->dev, QEDR_MSG_MISC,
-		 "searched for (addr=0x%llx,len=0x%lx) for ctx=%p, result=%d\n",
-		 mm->key.phy_addr, mm->key.len, uctx, found);
-
-	return found;
-}
-
 int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
 {
 	struct ib_device *ibdev = uctx->device;
 	int rc;
 	struct qedr_ucontext *ctx = get_qedr_ucontext(uctx);
 	struct qedr_alloc_ucontext_resp uresp = {};
+	struct qedr_alloc_ucontext_req ureq = {};
 	struct qedr_dev *dev = get_qedr_dev(ibdev);
 	struct qed_rdma_add_user_out_params oparams;
+	struct qedr_user_mmap_entry *entry;
 
 	if (!udata)
 		return -EFAULT;
 
+	if (udata->inlen) {
+		rc = ib_copy_from_udata(&ureq, udata,
+					min(sizeof(ureq), udata->inlen));
+		if (rc) {
+			DP_ERR(dev, "Problem copying data from user space\n");
+			return -EFAULT;
+		}
+		ctx->edpm_mode = !!(ureq.context_flags &
+				    QEDR_ALLOC_UCTX_EDPM_MODE);
+		ctx->db_rec = !!(ureq.context_flags & QEDR_ALLOC_UCTX_DB_REC);
+	}
+
 	rc = dev->ops->rdma_add_user(dev->rdma_ctx, &oparams);
 	if (rc) {
 		DP_ERR(dev,
@@ -335,13 +294,44 @@
 	ctx->dpi_addr = oparams.dpi_addr;
 	ctx->dpi_phys_addr = oparams.dpi_phys_addr;
 	ctx->dpi_size = oparams.dpi_size;
-	INIT_LIST_HEAD(&ctx->mm_head);
-	mutex_init(&ctx->mm_list_lock);
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry) {
+		rc = -ENOMEM;
+		goto err;
+	}
 
-	uresp.dpm_enabled = dev->user_dpm_enabled;
+	entry->io_address = ctx->dpi_phys_addr;
+	entry->length = ctx->dpi_size;
+	entry->mmap_flag = QEDR_USER_MMAP_IO_WC;
+	entry->dpi = ctx->dpi;
+	entry->dev = dev;
+	rc = rdma_user_mmap_entry_insert(uctx, &entry->rdma_entry,
+					 ctx->dpi_size);
+	if (rc) {
+		kfree(entry);
+		goto err;
+	}
+	ctx->db_mmap_entry = &entry->rdma_entry;
+
+	if (!dev->user_dpm_enabled)
+		uresp.dpm_flags = 0;
+	else if (rdma_protocol_iwarp(&dev->ibdev, 1))
+		uresp.dpm_flags = QEDR_DPM_TYPE_IWARP_LEGACY;
+	else
+		uresp.dpm_flags = QEDR_DPM_TYPE_ROCE_ENHANCED |
+				  QEDR_DPM_TYPE_ROCE_LEGACY |
+				  QEDR_DPM_TYPE_ROCE_EDPM_MODE;
+
+	if (ureq.context_flags & QEDR_SUPPORT_DPM_SIZES) {
+		uresp.dpm_flags |= QEDR_DPM_SIZES_SET;
+		uresp.ldpm_limit_size = QEDR_LDPM_MAX_SIZE;
+		uresp.edpm_trans_size = QEDR_EDPM_TRANS_SIZE;
+		uresp.edpm_limit_size = QEDR_EDPM_MAX_SIZE;
+	}
+
 	uresp.wids_enabled = 1;
 	uresp.wid_count = oparams.wid_count;
-	uresp.db_pa = ctx->dpi_phys_addr;
+	uresp.db_pa = rdma_user_mmap_get_offset(ctx->db_mmap_entry);
 	uresp.db_size = ctx->dpi_size;
 	uresp.max_send_wr = dev->attr.max_sqe;
 	uresp.max_recv_wr = dev->attr.max_rqe;
@@ -353,82 +343,92 @@
 
 	rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 	if (rc)
-		return rc;
+		goto err;
 
 	ctx->dev = dev;
 
-	rc = qedr_add_mmap(ctx, ctx->dpi_phys_addr, ctx->dpi_size);
-	if (rc)
-		return rc;
-
 	DP_DEBUG(dev, QEDR_MSG_INIT, "Allocating user context %p\n",
 		 &ctx->ibucontext);
 	return 0;
+
+err:
+	if (!ctx->db_mmap_entry)
+		dev->ops->rdma_remove_user(dev->rdma_ctx, ctx->dpi);
+	else
+		rdma_user_mmap_entry_remove(ctx->db_mmap_entry);
+
+	return rc;
 }
 
 void qedr_dealloc_ucontext(struct ib_ucontext *ibctx)
 {
 	struct qedr_ucontext *uctx = get_qedr_ucontext(ibctx);
-	struct qedr_mm *mm, *tmp;
 
 	DP_DEBUG(uctx->dev, QEDR_MSG_INIT, "Deallocating user context %p\n",
 		 uctx);
-	uctx->dev->ops->rdma_remove_user(uctx->dev->rdma_ctx, uctx->dpi);
 
-	list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) {
-		DP_DEBUG(uctx->dev, QEDR_MSG_MISC,
-			 "deleted (addr=0x%llx,len=0x%lx) for ctx=%p\n",
-			 mm->key.phy_addr, mm->key.len, uctx);
-		list_del(&mm->entry);
-		kfree(mm);
-	}
+	rdma_user_mmap_entry_remove(uctx->db_mmap_entry);
 }
 
-int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+void qedr_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
 {
-	struct qedr_ucontext *ucontext = get_qedr_ucontext(context);
-	struct qedr_dev *dev = get_qedr_dev(context->device);
-	unsigned long phys_addr = vma->vm_pgoff << PAGE_SHIFT;
-	unsigned long len = (vma->vm_end - vma->vm_start);
-	unsigned long dpi_start;
+	struct qedr_user_mmap_entry *entry = get_qedr_mmap_entry(rdma_entry);
+	struct qedr_dev *dev = entry->dev;
 
-	dpi_start = dev->db_phys_addr + (ucontext->dpi * ucontext->dpi_size);
+	if (entry->mmap_flag == QEDR_USER_MMAP_PHYS_PAGE)
+		free_page((unsigned long)entry->address);
+	else if (entry->mmap_flag == QEDR_USER_MMAP_IO_WC)
+		dev->ops->rdma_remove_user(dev->rdma_ctx, entry->dpi);
 
-	DP_DEBUG(dev, QEDR_MSG_INIT,
-		 "mmap invoked with vm_start=0x%pK, vm_end=0x%pK,vm_pgoff=0x%pK; dpi_start=0x%pK dpi_size=0x%x\n",
-		 (void *)vma->vm_start, (void *)vma->vm_end,
-		 (void *)vma->vm_pgoff, (void *)dpi_start, ucontext->dpi_size);
+	kfree(entry);
+}
 
-	if ((vma->vm_start & (PAGE_SIZE - 1)) || (len & (PAGE_SIZE - 1))) {
-		DP_ERR(dev,
-		       "failed mmap, addresses must be page aligned: start=0x%pK, end=0x%pK\n",
-		       (void *)vma->vm_start, (void *)vma->vm_end);
+int qedr_mmap(struct ib_ucontext *ucontext, struct vm_area_struct *vma)
+{
+	struct ib_device *dev = ucontext->device;
+	size_t length = vma->vm_end - vma->vm_start;
+	struct rdma_user_mmap_entry *rdma_entry;
+	struct qedr_user_mmap_entry *entry;
+	int rc = 0;
+	u64 pfn;
+
+	ibdev_dbg(dev,
+		  "start %#lx, end %#lx, length = %#zx, pgoff = %#lx\n",
+		  vma->vm_start, vma->vm_end, length, vma->vm_pgoff);
+
+	rdma_entry = rdma_user_mmap_entry_get(ucontext, vma);
+	if (!rdma_entry) {
+		ibdev_dbg(dev, "pgoff[%#lx] does not have valid entry\n",
+			  vma->vm_pgoff);
 		return -EINVAL;
 	}
+	entry = get_qedr_mmap_entry(rdma_entry);
+	ibdev_dbg(dev,
+		  "Mapping address[%#llx], length[%#zx], mmap_flag[%d]\n",
+		  entry->io_address, length, entry->mmap_flag);
 
-	if (!qedr_search_mmap(ucontext, phys_addr, len)) {
-		DP_ERR(dev, "failed mmap, vm_pgoff=0x%lx is not authorized\n",
-		       vma->vm_pgoff);
-		return -EINVAL;
+	switch (entry->mmap_flag) {
+	case QEDR_USER_MMAP_IO_WC:
+		pfn = entry->io_address >> PAGE_SHIFT;
+		rc = rdma_user_mmap_io(ucontext, vma, pfn, length,
+				       pgprot_writecombine(vma->vm_page_prot),
+				       rdma_entry);
+		break;
+	case QEDR_USER_MMAP_PHYS_PAGE:
+		rc = vm_insert_page(vma, vma->vm_start,
+				    virt_to_page(entry->address));
+		break;
+	default:
+		rc = -EINVAL;
 	}
 
-	if (phys_addr < dpi_start ||
-	    ((phys_addr + len) > (dpi_start + ucontext->dpi_size))) {
-		DP_ERR(dev,
-		       "failed mmap, pages are outside of dpi; page address=0x%pK, dpi_start=0x%pK, dpi_size=0x%x\n",
-		       (void *)phys_addr, (void *)dpi_start,
-		       ucontext->dpi_size);
-		return -EINVAL;
-	}
+	if (rc)
+		ibdev_dbg(dev,
+			  "Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n",
+			  entry->io_address, length, entry->mmap_flag, rc);
 
-	if (vma->vm_flags & VM_READ) {
-		DP_ERR(dev, "failed mmap, cannot map doorbell bar for read\n");
-		return -EINVAL;
-	}
-
-	vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
-	return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, len,
-				  vma->vm_page_prot);
+	rdma_user_mmap_entry_put(rdma_entry);
+	return rc;
 }
 
 int qedr_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
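
Note: the driver-private mm_list bookkeeping is replaced here by the core rdma_user_mmap_entry API: insert an entry at ucontext setup, hand the opaque offset to userspace, resolve it again in .mmap, and drop the reference when done. A condensed sketch of that flow, using the same calls as the hunk above (variable names follow the driver code):

/* Allocation side: register the doorbell region and report its mmap offset. */
rc = rdma_user_mmap_entry_insert(uctx, &entry->rdma_entry, entry->length);
if (rc)
	return rc;
uresp.db_pa = rdma_user_mmap_get_offset(&entry->rdma_entry);

/* mmap side: look the entry up by vm_pgoff, map it, then put the reference. */
rdma_entry = rdma_user_mmap_entry_get(ucontext, vma);
if (!rdma_entry)
	return -EINVAL;
rc = rdma_user_mmap_io(ucontext, vma, entry->io_address >> PAGE_SHIFT,
		       vma->vm_end - vma->vm_start,
		       pgprot_writecombine(vma->vm_page_prot), rdma_entry);
rdma_user_mmap_entry_put(rdma_entry);
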
@@ -474,15 +474,33 @@
 	return 0;
 }
 
-void qedr_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+int qedr_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
 	struct qedr_dev *dev = get_qedr_dev(ibpd->device);
 	struct qedr_pd *pd = get_qedr_pd(ibpd);
 
 	DP_DEBUG(dev, QEDR_MSG_INIT, "Deallocating PD %d\n", pd->pd_id);
 	dev->ops->rdma_dealloc_pd(dev->rdma_ctx, pd->pd_id);
+	return 0;
 }
 
+
+int qedr_alloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata)
+{
+	struct qedr_dev *dev = get_qedr_dev(ibxrcd->device);
+	struct qedr_xrcd *xrcd = get_qedr_xrcd(ibxrcd);
+
+	return dev->ops->rdma_alloc_xrcd(dev->rdma_ctx, &xrcd->xrcd_id);
+}
+
+int qedr_dealloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata)
+{
+	struct qedr_dev *dev = get_qedr_dev(ibxrcd->device);
+	u16 xrcd_id = get_qedr_xrcd(ibxrcd)->xrcd_id;
+
+	dev->ops->rdma_dealloc_xrcd(dev->rdma_ctx, xrcd_id);
+	return 0;
+}
 static void qedr_free_pbl(struct qedr_dev *dev,
 			  struct qedr_pbl_info *pbl_info, struct qedr_pbl *pbl)
 {
@@ -603,11 +621,9 @@
 			       struct qedr_pbl_info *pbl_info, u32 pg_shift)
 {
 	int pbe_cnt, total_num_pbes = 0;
-	u32 fw_pg_cnt, fw_pg_per_umem_pg;
 	struct qedr_pbl *pbl_tbl;
-	struct sg_dma_page_iter sg_iter;
+	struct ib_block_iter biter;
 	struct regpair *pbe;
-	u64 pg_addr;
 
 	if (!pbl_info->num_pbes)
 		return;
@@ -628,46 +644,73 @@
 
 	pbe_cnt = 0;
 
-	fw_pg_per_umem_pg = BIT(PAGE_SHIFT - pg_shift);
+	rdma_umem_for_each_dma_block (umem, &biter, BIT(pg_shift)) {
+		u64 pg_addr = rdma_block_iter_dma_address(&biter);
 
-	for_each_sg_dma_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
-		pg_addr = sg_page_iter_dma_address(&sg_iter);
-		for (fw_pg_cnt = 0; fw_pg_cnt < fw_pg_per_umem_pg;) {
-			pbe->lo = cpu_to_le32(pg_addr);
-			pbe->hi = cpu_to_le32(upper_32_bits(pg_addr));
+		pbe->lo = cpu_to_le32(pg_addr);
+		pbe->hi = cpu_to_le32(upper_32_bits(pg_addr));
 
-			pg_addr += BIT(pg_shift);
-			pbe_cnt++;
-			total_num_pbes++;
-			pbe++;
+		pbe_cnt++;
+		total_num_pbes++;
+		pbe++;
 
-			if (total_num_pbes == pbl_info->num_pbes)
-				return;
+		if (total_num_pbes == pbl_info->num_pbes)
+			return;
 
-			/* If the given pbl is full storing the pbes,
-			 * move to next pbl.
-			 */
-			if (pbe_cnt == (pbl_info->pbl_size / sizeof(u64))) {
-				pbl_tbl++;
-				pbe = (struct regpair *)pbl_tbl->va;
-				pbe_cnt = 0;
-			}
-
-			fw_pg_cnt++;
+		/* If the given pbl is full storing the pbes, move to next pbl.
+		 */
+		if (pbe_cnt == (pbl_info->pbl_size / sizeof(u64))) {
+			pbl_tbl++;
+			pbe = (struct regpair *)pbl_tbl->va;
+			pbe_cnt = 0;
 		}
 	}
 }
 
+static int qedr_db_recovery_add(struct qedr_dev *dev,
+				void __iomem *db_addr,
+				void *db_data,
+				enum qed_db_rec_width db_width,
+				enum qed_db_rec_space db_space)
+{
+	if (!db_data) {
+		DP_DEBUG(dev, QEDR_MSG_INIT, "avoiding db rec since old lib\n");
+		return 0;
+	}
+
+	return dev->ops->common->db_recovery_add(dev->cdev, db_addr, db_data,
+						 db_width, db_space);
+}
+
+static void qedr_db_recovery_del(struct qedr_dev *dev,
+				 void __iomem *db_addr,
+				 void *db_data)
+{
+	if (!db_data) {
+		DP_DEBUG(dev, QEDR_MSG_INIT, "avoiding db rec since old lib\n");
+		return;
+	}
+
+	/* Ignore return code as there is not much we can do about it. Error
+	 * log will be printed inside.
+	 */
+	dev->ops->common->db_recovery_del(dev->cdev, db_addr, db_data);
+}
+
 static int qedr_copy_cq_uresp(struct qedr_dev *dev,
-			      struct qedr_cq *cq, struct ib_udata *udata)
+			      struct qedr_cq *cq, struct ib_udata *udata,
+			      u32 db_offset)
 {
 	struct qedr_create_cq_uresp uresp;
 	int rc;
 
 	memset(&uresp, 0, sizeof(uresp));
 
-	uresp.db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT);
+	uresp.db_offset = db_offset;
 	uresp.icid = cq->icid;
+	if (cq->q.db_mmap_entry)
+		uresp.db_rec_addr =
+			rdma_user_mmap_get_offset(cq->q.db_mmap_entry);
 
 	rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp));
 	if (rc)
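
Note: qedr_populate_pbls() now walks the umem with the generic block iterator instead of for_each_sg_dma_page() plus manual sub-page accounting. A minimal sketch of the iterator as used above (the loop body is a placeholder):

struct ib_block_iter biter;
u64 dma;

/* Visit the umem once per aligned block of 1 << pg_shift bytes. */
rdma_umem_for_each_dma_block(umem, &biter, BIT(pg_shift)) {
	dma = rdma_block_iter_dma_address(&biter);
	/* program 'dma' into the next page-buffer-list entry here */
}
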
@@ -695,10 +738,58 @@
 	return aligned_size / QEDR_CQE_SIZE;
 }
 
+static int qedr_init_user_db_rec(struct ib_udata *udata,
+				 struct qedr_dev *dev, struct qedr_userq *q,
+				 bool requires_db_rec)
+{
+	struct qedr_ucontext *uctx =
+		rdma_udata_to_drv_context(udata, struct qedr_ucontext,
+					  ibucontext);
+	struct qedr_user_mmap_entry *entry;
+	int rc;
+
+	/* Aborting for non doorbell userqueue (SRQ) or non-supporting lib */
+	if (requires_db_rec == 0 || !uctx->db_rec)
+		return 0;
+
+	/* Allocate a page for doorbell recovery, add to mmap */
+	q->db_rec_data = (void *)get_zeroed_page(GFP_USER);
+	if (!q->db_rec_data) {
+		DP_ERR(dev, "get_zeroed_page failed\n");
+		return -ENOMEM;
+	}
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		goto err_free_db_data;
+
+	entry->address = q->db_rec_data;
+	entry->length = PAGE_SIZE;
+	entry->mmap_flag = QEDR_USER_MMAP_PHYS_PAGE;
+	rc = rdma_user_mmap_entry_insert(&uctx->ibucontext,
+					 &entry->rdma_entry,
+					 PAGE_SIZE);
+	if (rc)
+		goto err_free_entry;
+
+	q->db_mmap_entry = &entry->rdma_entry;
+
+	return 0;
+
+err_free_entry:
+	kfree(entry);
+
+err_free_db_data:
+	free_page((unsigned long)q->db_rec_data);
+	q->db_rec_data = NULL;
+	return -ENOMEM;
+}
+
 static inline int qedr_init_user_queue(struct ib_udata *udata,
 				       struct qedr_dev *dev,
 				       struct qedr_userq *q, u64 buf_addr,
-				       size_t buf_len, int access, int dmasync,
+				       size_t buf_len, bool requires_db_rec,
+				       int access,
 				       int alloc_and_init)
 {
 	u32 fw_pages;
@@ -706,16 +797,14 @@
 
 	q->buf_addr = buf_addr;
 	q->buf_len = buf_len;
-	q->umem = ib_umem_get(udata, q->buf_addr, q->buf_len, access, dmasync);
+	q->umem = ib_umem_get(&dev->ibdev, q->buf_addr, q->buf_len, access);
 	if (IS_ERR(q->umem)) {
 		DP_ERR(dev, "create user queue: failed ib_umem_get, got %ld\n",
 		       PTR_ERR(q->umem));
 		return PTR_ERR(q->umem);
 	}
 
-	fw_pages = ib_umem_page_count(q->umem) <<
-	    (PAGE_SHIFT - FW_PAGE_SHIFT);
-
+	fw_pages = ib_umem_num_dma_blocks(q->umem, 1 << FW_PAGE_SHIFT);
 	rc = qedr_prepare_pbl_tbl(dev, &q->pbl_info, fw_pages, 0);
 	if (rc)
 		goto err0;
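
Note: two umem API changes are folded into this hunk: ib_umem_get() now takes the ib_device instead of the udata, and the firmware page count comes from ib_umem_num_dma_blocks() at an explicit block size rather than ib_umem_page_count() plus shifting. Sketch, with error handling trimmed:

struct ib_umem *umem;
u32 fw_pages;

umem = ib_umem_get(&dev->ibdev, buf_addr, buf_len, IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(umem))
	return PTR_ERR(umem);

/* Number of FW_PAGE_SHIFT-sized DMA blocks backing the buffer. */
fw_pages = ib_umem_num_dma_blocks(umem, 1 << FW_PAGE_SHIFT);
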
@@ -736,7 +825,8 @@
 		}
 	}
 
-	return 0;
+	/* mmap the user address used to store doorbell data for recovery */
+	return qedr_init_user_db_rec(udata, dev, q, requires_db_rec);
 
 err0:
 	ib_umem_release(q->umem);
@@ -815,6 +905,12 @@
 		udata, struct qedr_ucontext, ibucontext);
 	struct qed_rdma_destroy_cq_out_params destroy_oparams;
 	struct qed_rdma_destroy_cq_in_params destroy_iparams;
+	struct qed_chain_init_params chain_params = {
+		.mode		= QED_CHAIN_MODE_PBL,
+		.intended_use	= QED_CHAIN_USE_TO_CONSUME,
+		.cnt_type	= QED_CHAIN_CNT_TYPE_U32,
+		.elem_size	= sizeof(union rdma_cqe),
+	};
 	struct qedr_dev *dev = get_qedr_dev(ibdev);
 	struct qed_rdma_create_cq_in_params params;
 	struct qedr_create_cq_ureq ureq = {};
@@ -822,6 +918,7 @@
 	int entries = attr->cqe;
 	struct qedr_cq *cq = get_qedr_cq(ibcq);
 	int chain_entries;
+	u32 db_offset;
 	int page_cnt;
 	u64 pbl_ptr;
 	u16 icid;
@@ -840,9 +937,14 @@
 
 	chain_entries = qedr_align_cq_entries(entries);
 	chain_entries = min_t(int, chain_entries, QEDR_MAX_CQES);
+	chain_params.num_elems = chain_entries;
+
+	/* calc db offset. user will add DPI base, kernel will add db addr */
+	db_offset = DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT);
 
 	if (udata) {
-		if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) {
+		if (ib_copy_from_udata(&ureq, udata, min(sizeof(ureq),
+							 udata->inlen))) {
 			DP_ERR(dev,
 			       "create cq: problem copying data from user space\n");
 			goto err0;
@@ -857,7 +959,7 @@
 		cq->cq_type = QEDR_CQ_TYPE_USER;
 
 		rc = qedr_init_user_queue(udata, dev, &cq->q, ureq.addr,
-					  ureq.len, IB_ACCESS_LOCAL_WRITE, 1,
+					  ureq.len, true, IB_ACCESS_LOCAL_WRITE,
 					  1);
 		if (rc)
 			goto err0;
@@ -866,18 +968,14 @@
 		page_cnt = cq->q.pbl_info.num_pbes;
 
 		cq->ibcq.cqe = chain_entries;
+		cq->q.db_addr = ctx->dpi_addr + db_offset;
 	} else {
 		cq->cq_type = QEDR_CQ_TYPE_KERNEL;
 
-		rc = dev->ops->common->chain_alloc(dev->cdev,
-						   QED_CHAIN_USE_TO_CONSUME,
-						   QED_CHAIN_MODE_PBL,
-						   QED_CHAIN_CNT_TYPE_U32,
-						   chain_entries,
-						   sizeof(union rdma_cqe),
-						   &cq->pbl, NULL);
+		rc = dev->ops->common->chain_alloc(dev->cdev, &cq->pbl,
+						   &chain_params);
 		if (rc)
-			goto err1;
+			goto err0;
 
 		page_cnt = qed_chain_get_page_cnt(&cq->pbl);
 		pbl_ptr = qed_chain_get_pbl_phys(&cq->pbl);
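
Note: the qed chain allocator now takes a single qed_chain_init_params structure instead of a long positional argument list. For the CQ case above, the conversion reduces to roughly the following (fields exactly as in the hunks, num_elems filled per queue before the call):

struct qed_chain_init_params params = {
	.mode		= QED_CHAIN_MODE_PBL,
	.intended_use	= QED_CHAIN_USE_TO_CONSUME,
	.cnt_type	= QED_CHAIN_CNT_TYPE_U32,
	.elem_size	= sizeof(union rdma_cqe),
	.num_elems	= chain_entries,
};

rc = dev->ops->common->chain_alloc(dev->cdev, &cq->pbl, &params);
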
@@ -889,22 +987,29 @@
 
 	rc = dev->ops->rdma_create_cq(dev->rdma_ctx, &params, &icid);
 	if (rc)
-		goto err2;
+		goto err1;
 
 	cq->icid = icid;
 	cq->sig = QEDR_CQ_MAGIC_NUMBER;
 	spin_lock_init(&cq->cq_lock);
 
 	if (udata) {
-		rc = qedr_copy_cq_uresp(dev, cq, udata);
+		rc = qedr_copy_cq_uresp(dev, cq, udata, db_offset);
 		if (rc)
-			goto err3;
+			goto err2;
+
+		rc = qedr_db_recovery_add(dev, cq->q.db_addr,
+					  &cq->q.db_rec_data->db_data,
+					  DB_REC_WIDTH_64B,
+					  DB_REC_USER);
+		if (rc)
+			goto err2;
+
 	} else {
 		/* Generate doorbell address. */
-		cq->db_addr = dev->db_addr +
-		    DB_ADDR_SHIFT(DQ_PWM_OFFSET_UCM_RDMA_CQ_CONS_32BIT);
 		cq->db.data.icid = cq->icid;
-		cq->db.data.params = DB_AGG_CMD_SET <<
+		cq->db_addr = dev->db_addr + db_offset;
+		cq->db.data.params = DB_AGG_CMD_MAX <<
 		    RDMA_PWM_VAL32_DATA_AGG_CMD_SHIFT;
 
 		/* point to the very last element, passing it we will toggle */
@@ -913,6 +1018,11 @@
 		cq->latest_cqe = NULL;
 		consume_cqe(cq);
 		cq->cq_cons = qed_chain_get_cons_idx_u32(&cq->pbl);
+
+		rc = qedr_db_recovery_add(dev, cq->db_addr, &cq->db.data,
+					  DB_REC_WIDTH_64B, DB_REC_KERNEL);
+		if (rc)
+			goto err2;
 	}
 
 	DP_DEBUG(dev, QEDR_MSG_CQ,
@@ -921,18 +1031,19 @@
 
 	return 0;
 
-err3:
+err2:
 	destroy_iparams.icid = cq->icid;
 	dev->ops->rdma_destroy_cq(dev->rdma_ctx, &destroy_iparams,
 				  &destroy_oparams);
-err2:
-	if (udata)
-		qedr_free_pbl(dev, &cq->q.pbl_info, cq->q.pbl_tbl);
-	else
-		dev->ops->common->chain_free(dev->cdev, &cq->pbl);
 err1:
-	if (udata)
+	if (udata) {
+		qedr_free_pbl(dev, &cq->q.pbl_info, cq->q.pbl_tbl);
 		ib_umem_release(cq->q.umem);
+		if (cq->q.db_mmap_entry)
+			rdma_user_mmap_entry_remove(cq->q.db_mmap_entry);
+	} else {
+		dev->ops->common->chain_free(dev->cdev, &cq->pbl);
+	}
 err0:
 	return -EINVAL;
 }
@@ -950,7 +1061,7 @@
 #define QEDR_DESTROY_CQ_MAX_ITERATIONS		(10)
 #define QEDR_DESTROY_CQ_ITER_DURATION		(10)
 
-void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
 	struct qedr_dev *dev = get_qedr_dev(ibcq->device);
 	struct qed_rdma_destroy_cq_out_params oparams;
@@ -963,8 +1074,10 @@
 	cq->destroyed = 1;
 
 	/* GSIs CQs are handled by driver, so they don't exist in the FW */
-	if (cq->cq_type == QEDR_CQ_TYPE_GSI)
-		return;
+	if (cq->cq_type == QEDR_CQ_TYPE_GSI) {
+		qedr_db_recovery_del(dev, cq->db_addr, &cq->db.data);
+		return 0;
+	}
 
 	iparams.icid = cq->icid;
 	dev->ops->rdma_destroy_cq(dev->rdma_ctx, &iparams, &oparams);
@@ -973,6 +1086,14 @@
 	if (udata) {
 		qedr_free_pbl(dev, &cq->q.pbl_info, cq->q.pbl_tbl);
 		ib_umem_release(cq->q.umem);
+
+		if (cq->q.db_rec_data) {
+			qedr_db_recovery_del(dev, cq->q.db_addr,
+					     &cq->q.db_rec_data->db_data);
+			rdma_user_mmap_entry_remove(cq->q.db_mmap_entry);
+		}
+	} else {
+		qedr_db_recovery_del(dev, cq->db_addr, &cq->db.data);
 	}
 
 	/* We don't want the IRQ handler to handle a non-existing CQ so we
@@ -1003,6 +1124,7 @@
 	 * Since the destroy CQ ramrod has also been received on the EQ we can
 	 * be certain that there's no event handler in process.
 	 */
+	return 0;
 }
 
 static inline int get_gid_info_from_table(struct ib_qp *ibqp,
@@ -1035,7 +1157,7 @@
 		SET_FIELD(qp_params->modify_flags,
 			  QED_ROCE_MODIFY_QP_VALID_ROCE_MODE, 1);
 		break;
-	case RDMA_NETWORK_IB:
+	case RDMA_NETWORK_ROCE_V1:
 		memcpy(&qp_params->sgid.bytes[0], &gid_attr->gid.raw[0],
 		       sizeof(qp_params->sgid));
 		memcpy(&qp_params->dgid.bytes[0],
@@ -1055,6 +1177,8 @@
 			  QED_ROCE_MODIFY_QP_VALID_ROCE_MODE, 1);
 		qp_params->roce_mode = ROCE_V2_IPV4;
 		break;
+	default:
+		return -EINVAL;
 	}
 
 	for (i = 0; i < 4; i++) {
@@ -1075,11 +1199,14 @@
 	struct qedr_device_attr *qattr = &dev->attr;
 
 	/* QP0... attrs->qp_type == IB_QPT_GSI */
-	if (attrs->qp_type != IB_QPT_RC && attrs->qp_type != IB_QPT_GSI) {
+	if (attrs->qp_type != IB_QPT_RC &&
+	    attrs->qp_type != IB_QPT_GSI &&
+	    attrs->qp_type != IB_QPT_XRC_INI &&
+	    attrs->qp_type != IB_QPT_XRC_TGT) {
 		DP_DEBUG(dev, QEDR_MSG_QP,
 			 "create qp: unsupported qp type=0x%x requested\n",
 			 attrs->qp_type);
-		return -EINVAL;
+		return -EOPNOTSUPP;
 	}
 
 	if (attrs->cap.max_send_wr > qattr->max_sqe) {
@@ -1110,12 +1237,21 @@
 		return -EINVAL;
 	}
 
-	/* Unprivileged user space cannot create special QP */
-	if (udata && attrs->qp_type == IB_QPT_GSI) {
-		DP_ERR(dev,
-		       "create qp: userspace can't create special QPs of type=0x%x\n",
-		       attrs->qp_type);
-		return -EINVAL;
+	/* verify consumer QPs are not trying to use GSI QP's CQ.
+	 * TGT QP isn't associated with RQ/SQ
+	 */
+	if ((attrs->qp_type != IB_QPT_GSI) && (dev->gsi_qp_created) &&
+	    (attrs->qp_type != IB_QPT_XRC_TGT) &&
+	    (attrs->qp_type != IB_QPT_XRC_INI)) {
+		struct qedr_cq *send_cq = get_qedr_cq(attrs->send_cq);
+		struct qedr_cq *recv_cq = get_qedr_cq(attrs->recv_cq);
+
+		if ((send_cq->cq_type == QEDR_CQ_TYPE_GSI) ||
+		    (recv_cq->cq_type == QEDR_CQ_TYPE_GSI)) {
+			DP_ERR(dev,
+			       "create qp: consumer QP cannot use GSI CQs.\n");
+			return -EINVAL;
+		}
 	}
 
 	return 0;
@@ -1151,6 +1287,9 @@
 	}
 
 	uresp->rq_icid = qp->icid;
+	if (qp->urq.db_mmap_entry)
+		uresp->rq_db_rec_addr =
+			rdma_user_mmap_get_offset(qp->urq.db_mmap_entry);
 }
 
 static void qedr_copy_sq_uresp(struct qedr_dev *dev,
@@ -1164,22 +1303,30 @@
 		uresp->sq_icid = qp->icid;
 	else
 		uresp->sq_icid = qp->icid + 1;
+
+	if (qp->usq.db_mmap_entry)
+		uresp->sq_db_rec_addr =
+			rdma_user_mmap_get_offset(qp->usq.db_mmap_entry);
 }
 
 static int qedr_copy_qp_uresp(struct qedr_dev *dev,
-			      struct qedr_qp *qp, struct ib_udata *udata)
+			      struct qedr_qp *qp, struct ib_udata *udata,
+			      struct qedr_create_qp_uresp *uresp)
 {
-	struct qedr_create_qp_uresp uresp;
 	int rc;
 
-	memset(&uresp, 0, sizeof(uresp));
-	qedr_copy_sq_uresp(dev, &uresp, qp);
-	qedr_copy_rq_uresp(dev, &uresp, qp);
+	memset(uresp, 0, sizeof(*uresp));
 
-	uresp.atomic_supported = dev->atomic_cap != IB_ATOMIC_NONE;
-	uresp.qp_id = qp->qp_id;
+	if (qedr_qp_has_sq(qp))
+		qedr_copy_sq_uresp(dev, uresp, qp);
 
-	rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+	if (qedr_qp_has_rq(qp))
+		qedr_copy_rq_uresp(dev, uresp, qp);
+
+	uresp->atomic_supported = dev->atomic_cap != IB_ATOMIC_NONE;
+	uresp->qp_id = qp->qp_id;
+
+	rc = qedr_ib_copy_to_udata(udata, uresp, sizeof(*uresp));
 	if (rc)
 		DP_ERR(dev,
 		       "create qp: failed a copy to user space with qp icid=0x%x.\n",
@@ -1198,18 +1345,25 @@
 		kref_init(&qp->refcnt);
 		init_completion(&qp->iwarp_cm_comp);
 	}
+
 	qp->pd = pd;
 	qp->qp_type = attrs->qp_type;
 	qp->max_inline_data = attrs->cap.max_inline_data;
-	qp->sq.max_sges = attrs->cap.max_send_sge;
 	qp->state = QED_ROCE_QP_STATE_RESET;
 	qp->signaled = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) ? true : false;
-	qp->sq_cq = get_qedr_cq(attrs->send_cq);
 	qp->dev = dev;
+	if (qedr_qp_has_sq(qp)) {
+		qp->sq.max_sges = attrs->cap.max_send_sge;
+		qp->sq_cq = get_qedr_cq(attrs->send_cq);
+		DP_DEBUG(dev, QEDR_MSG_QP,
+			 "SQ params:\tsq_max_sges = %d, sq_cq_id = %d\n",
+			 qp->sq.max_sges, qp->sq_cq->icid);
+	}
 
-	if (attrs->srq) {
+	if (attrs->srq)
 		qp->srq = get_qedr_srq(attrs->srq);
-	} else {
+
+	if (qedr_qp_has_rq(qp)) {
 		qp->rq_cq = get_qedr_cq(attrs->recv_cq);
 		qp->rq.max_sges = attrs->cap.max_recv_sge;
 		DP_DEBUG(dev, QEDR_MSG_QP,
@@ -1226,16 +1380,31 @@
 		 qp->sq.max_sges, qp->sq_cq->icid);
 }
 
-static void qedr_set_roce_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
+static int qedr_set_roce_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
 {
-	qp->sq.db = dev->db_addr +
-		    DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD);
-	qp->sq.db_data.data.icid = qp->icid + 1;
-	if (!qp->srq) {
+	int rc = 0;
+
+	if (qedr_qp_has_sq(qp)) {
+		qp->sq.db = dev->db_addr +
+			    DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD);
+		qp->sq.db_data.data.icid = qp->icid + 1;
+		rc = qedr_db_recovery_add(dev, qp->sq.db, &qp->sq.db_data,
+					  DB_REC_WIDTH_32B, DB_REC_KERNEL);
+		if (rc)
+			return rc;
+	}
+
+	if (qedr_qp_has_rq(qp)) {
 		qp->rq.db = dev->db_addr +
 			    DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_ROCE_RQ_PROD);
 		qp->rq.db_data.data.icid = qp->icid;
+		rc = qedr_db_recovery_add(dev, qp->rq.db, &qp->rq.db_data,
+					  DB_REC_WIDTH_32B, DB_REC_KERNEL);
+		if (rc && qedr_qp_has_sq(qp))
+			qedr_db_recovery_del(dev, qp->sq.db, &qp->sq.db_data);
 	}
+
+	return rc;
 }
 
 static int qedr_check_srq_params(struct qedr_dev *dev,
@@ -1255,6 +1424,10 @@
 		DP_ERR(dev,
 		       "create srq: unsupported sge=0x%x requested (max_srq_sge=0x%x)\n",
 		       attrs->attr.max_sge, qattr->max_sge);
+	}
+
+	if (!udata && attrs->srq_type == IB_SRQT_XRC) {
+		DP_ERR(dev, "XRC SRQs are not supported in kernel-space\n");
 		return -EINVAL;
 	}
 
@@ -1283,19 +1456,18 @@
 static int qedr_init_srq_user_params(struct ib_udata *udata,
 				     struct qedr_srq *srq,
 				     struct qedr_create_srq_ureq *ureq,
-				     int access, int dmasync)
+				     int access)
 {
 	struct scatterlist *sg;
 	int rc;
 
 	rc = qedr_init_user_queue(udata, srq->dev, &srq->usrq, ureq->srq_addr,
-				  ureq->srq_len, access, dmasync, 1);
+				  ureq->srq_len, false, access, 1);
 	if (rc)
 		return rc;
 
-	srq->prod_umem =
-		ib_umem_get(udata, ureq->prod_pair_addr,
-			    sizeof(struct rdma_srq_producers), access, dmasync);
+	srq->prod_umem = ib_umem_get(srq->ibsrq.device, ureq->prod_pair_addr,
+				     sizeof(struct rdma_srq_producers), access);
 	if (IS_ERR(srq->prod_umem)) {
 		qedr_free_pbl(srq->dev, &srq->usrq.pbl_info, srq->usrq.pbl_tbl);
 		ib_umem_release(srq->usrq.umem);
@@ -1316,6 +1488,12 @@
 					struct ib_srq_init_attr *init_attr)
 {
 	struct qedr_srq_hwq_info *hw_srq = &srq->hw_srq;
+	struct qed_chain_init_params params = {
+		.mode		= QED_CHAIN_MODE_PBL,
+		.intended_use	= QED_CHAIN_USE_TO_CONSUME_PRODUCE,
+		.cnt_type	= QED_CHAIN_CNT_TYPE_U32,
+		.elem_size	= QEDR_SRQ_WQE_ELEM_SIZE,
+	};
 	dma_addr_t phy_prod_pair_addr;
 	u32 num_elems;
 	void *va;
@@ -1334,13 +1512,9 @@
 	hw_srq->virt_prod_pair_addr = va;
 
 	num_elems = init_attr->attr.max_wr * RDMA_MAX_SRQ_WQE_SIZE;
-	rc = dev->ops->common->chain_alloc(dev->cdev,
-					   QED_CHAIN_USE_TO_CONSUME_PRODUCE,
-					   QED_CHAIN_MODE_PBL,
-					   QED_CHAIN_CNT_TYPE_U32,
-					   num_elems,
-					   QEDR_SRQ_WQE_ELEM_SIZE,
-					   &hw_srq->pbl, NULL);
+	params.num_elems = num_elems;
+
+	rc = dev->ops->common->chain_alloc(dev->cdev, &hw_srq->pbl, &params);
 	if (rc)
 		goto err0;
 
@@ -1378,6 +1552,7 @@
 		return -EINVAL;
 
 	srq->dev = dev;
+	srq->is_xrc = (init_attr->srq_type == IB_SRQT_XRC);
 	hw_srq = &srq->hw_srq;
 	spin_lock_init(&srq->lock);
 
@@ -1385,13 +1560,14 @@
 	hw_srq->max_sges = init_attr->attr.max_sge;
 
 	if (udata) {
-		if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) {
+		if (ib_copy_from_udata(&ureq, udata, min(sizeof(ureq),
+							 udata->inlen))) {
 			DP_ERR(dev,
 			       "create srq: problem copying data from user space\n");
 			goto err0;
 		}
 
-		rc = qedr_init_srq_user_params(udata, srq, &ureq, 0, 0);
+		rc = qedr_init_srq_user_params(udata, srq, &ureq, 0);
 		if (rc)
 			goto err0;
 
@@ -1418,6 +1594,14 @@
 	in_params.prod_pair_addr = phy_prod_pair_addr;
 	in_params.num_pages = page_cnt;
 	in_params.page_size = page_size;
+	if (srq->is_xrc) {
+		struct qedr_xrcd *xrcd = get_qedr_xrcd(init_attr->ext.xrc.xrcd);
+		struct qedr_cq *cq = get_qedr_cq(init_attr->ext.cq);
+
+		in_params.is_xrc = 1;
+		in_params.xrcd_id = xrcd->xrcd_id;
+		in_params.cq_cid = cq->icid;
+	}
 
 	rc = dev->ops->rdma_create_srq(dev->rdma_ctx, &in_params, &out_params);
 	if (rc)
@@ -1452,7 +1636,7 @@
 	return -EFAULT;
 }
 
-void qedr_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
+int qedr_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
 	struct qed_rdma_destroy_srq_in_params in_params = {};
 	struct qedr_dev *dev = get_qedr_dev(ibsrq->device);
@@ -1460,6 +1644,7 @@
 
 	xa_erase_irq(&dev->srqs, srq->srq_id);
 	in_params.srq_id = srq->srq_id;
+	in_params.is_xrc = srq->is_xrc;
 	dev->ops->rdma_destroy_srq(dev->rdma_ctx, &in_params);
 
 	if (ibsrq->uobject)
@@ -1470,6 +1655,7 @@
 	DP_DEBUG(dev, QEDR_MSG_SRQ,
 		 "destroy srq: destroyed srq with srq_id=0x%0x\n",
 		 srq->srq_id);
+	return 0;
 }
 
 int qedr_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
@@ -1510,6 +1696,20 @@
 	return 0;
 }
 
+static enum qed_rdma_qp_type qedr_ib_to_qed_qp_type(enum ib_qp_type ib_qp_type)
+{
+	switch (ib_qp_type) {
+	case IB_QPT_RC:
+		return QED_RDMA_QP_TYPE_RC;
+	case IB_QPT_XRC_INI:
+		return QED_RDMA_QP_TYPE_XRC_INI;
+	case IB_QPT_XRC_TGT:
+		return QED_RDMA_QP_TYPE_XRC_TGT;
+	default:
+		return QED_RDMA_QP_TYPE_INVAL;
+	}
+}
+
 static inline void
 qedr_init_common_qp_in_params(struct qedr_dev *dev,
 			      struct qedr_pd *pd,
@@ -1524,20 +1724,27 @@
 
 	params->signal_all = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR);
 	params->fmr_and_reserved_lkey = fmr_and_reserved_lkey;
-	params->pd = pd->pd_id;
-	params->dpi = pd->uctx ? pd->uctx->dpi : dev->dpi;
-	params->sq_cq_id = get_qedr_cq(attrs->send_cq)->icid;
+	params->qp_type = qedr_ib_to_qed_qp_type(attrs->qp_type);
 	params->stats_queue = 0;
-	params->srq_id = 0;
-	params->use_srq = false;
 
-	if (!qp->srq) {
+	if (pd) {
+		params->pd = pd->pd_id;
+		params->dpi = pd->uctx ? pd->uctx->dpi : dev->dpi;
+	}
+
+	if (qedr_qp_has_sq(qp))
+		params->sq_cq_id = get_qedr_cq(attrs->send_cq)->icid;
+
+	if (qedr_qp_has_rq(qp))
 		params->rq_cq_id = get_qedr_cq(attrs->recv_cq)->icid;
 
-	} else {
+	if (qedr_qp_has_srq(qp)) {
 		params->rq_cq_id = get_qedr_cq(attrs->recv_cq)->icid;
 		params->srq_id = qp->srq->srq_id;
 		params->use_srq = true;
+	} else {
+		params->srq_id = 0;
+		params->use_srq = false;
 	}
 }
 
@@ -1551,8 +1758,10 @@
 		 "rq_len=%zd"
 		 "\n",
 		 qp,
-		 qp->usq.buf_addr,
-		 qp->usq.buf_len, qp->urq.buf_addr, qp->urq.buf_len);
+		 qedr_qp_has_sq(qp) ? qp->usq.buf_addr : 0x0,
+		 qedr_qp_has_sq(qp) ? qp->usq.buf_len : 0,
+		 qedr_qp_has_rq(qp) ? qp->urq.buf_addr : 0x0,
+		 qedr_qp_has_sq(qp) ? qp->urq.buf_len : 0);
 }
 
 static inline void
@@ -1574,13 +1783,19 @@
 			   &qp->urq.pbl_info, FW_PAGE_SHIFT);
 }
 
-static void qedr_cleanup_user(struct qedr_dev *dev, struct qedr_qp *qp)
+static void qedr_cleanup_user(struct qedr_dev *dev,
+			      struct qedr_ucontext *ctx,
+			      struct qedr_qp *qp)
 {
-	ib_umem_release(qp->usq.umem);
-	qp->usq.umem = NULL;
+	if (qedr_qp_has_sq(qp)) {
+		ib_umem_release(qp->usq.umem);
+		qp->usq.umem = NULL;
+	}
 
-	ib_umem_release(qp->urq.umem);
-	qp->urq.umem = NULL;
+	if (qedr_qp_has_rq(qp)) {
+		ib_umem_release(qp->urq.umem);
+		qp->urq.umem = NULL;
+	}
 
 	if (rdma_protocol_roce(&dev->ibdev, 1)) {
 		qedr_free_pbl(dev, &qp->usq.pbl_info, qp->usq.pbl_tbl);
@@ -1589,6 +1804,22 @@
 		kfree(qp->usq.pbl_tbl);
 		kfree(qp->urq.pbl_tbl);
 	}
+
+	if (qp->usq.db_rec_data) {
+		qedr_db_recovery_del(dev, qp->usq.db_addr,
+				     &qp->usq.db_rec_data->db_data);
+		rdma_user_mmap_entry_remove(qp->usq.db_mmap_entry);
+	}
+
+	if (qp->urq.db_rec_data) {
+		qedr_db_recovery_del(dev, qp->urq.db_addr,
+				     &qp->urq.db_rec_data->db_data);
+		rdma_user_mmap_entry_remove(qp->urq.db_mmap_entry);
+	}
+
+	if (rdma_protocol_iwarp(&dev->ibdev, 1))
+		qedr_db_recovery_del(dev, qp->urq.db_rec_db2_addr,
+				     &qp->urq.db_rec_db2_data);
 }
 
 static int qedr_create_user_qp(struct qedr_dev *dev,
@@ -1599,29 +1830,41 @@
 {
 	struct qed_rdma_create_qp_in_params in_params;
 	struct qed_rdma_create_qp_out_params out_params;
-	struct qedr_pd *pd = get_qedr_pd(ibpd);
-	struct qedr_create_qp_ureq ureq;
+	struct qedr_create_qp_uresp uresp = {};
+	struct qedr_create_qp_ureq ureq = {};
 	int alloc_and_init = rdma_protocol_roce(&dev->ibdev, 1);
-	int rc = -EINVAL;
+	struct qedr_ucontext *ctx = NULL;
+	struct qedr_pd *pd = NULL;
+	int rc = 0;
 
 	qp->create_type = QEDR_QP_CREATE_USER;
-	memset(&ureq, 0, sizeof(ureq));
-	rc = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
-	if (rc) {
-		DP_ERR(dev, "Problem copying data from user space\n");
-		return rc;
+
+	if (ibpd) {
+		pd = get_qedr_pd(ibpd);
+		ctx = pd->uctx;
 	}
 
-	/* SQ - read access only (0), dma sync not required (0) */
-	rc = qedr_init_user_queue(udata, dev, &qp->usq, ureq.sq_addr,
-				  ureq.sq_len, 0, 0, alloc_and_init);
-	if (rc)
-		return rc;
+	if (udata) {
+		rc = ib_copy_from_udata(&ureq, udata, min(sizeof(ureq),
+					udata->inlen));
+		if (rc) {
+			DP_ERR(dev, "Problem copying data from user space\n");
+			return rc;
+		}
+	}
 
-	if (!qp->srq) {
-		/* RQ - read access only (0), dma sync not required (0) */
+	if (qedr_qp_has_sq(qp)) {
+		/* SQ - read access only (0) */
+		rc = qedr_init_user_queue(udata, dev, &qp->usq, ureq.sq_addr,
+					  ureq.sq_len, true, 0, alloc_and_init);
+		if (rc)
+			return rc;
+	}
+
+	if (qedr_qp_has_rq(qp)) {
+		/* RQ - read access only (0) */
 		rc = qedr_init_user_queue(udata, dev, &qp->urq, ureq.rq_addr,
-					  ureq.rq_len, 0, 0, alloc_and_init);
+					  ureq.rq_len, true, 0, alloc_and_init);
 		if (rc)
 			return rc;
 	}
@@ -1630,13 +1873,28 @@
 	qedr_init_common_qp_in_params(dev, pd, qp, attrs, false, &in_params);
 	in_params.qp_handle_lo = ureq.qp_handle_lo;
 	in_params.qp_handle_hi = ureq.qp_handle_hi;
-	in_params.sq_num_pages = qp->usq.pbl_info.num_pbes;
-	in_params.sq_pbl_ptr = qp->usq.pbl_tbl->pa;
-	if (!qp->srq) {
+
+	if (qp->qp_type == IB_QPT_XRC_TGT) {
+		struct qedr_xrcd *xrcd = get_qedr_xrcd(attrs->xrcd);
+
+		in_params.xrcd_id = xrcd->xrcd_id;
+		in_params.qp_handle_lo = qp->qp_id;
+		in_params.use_srq = 1;
+	}
+
+	if (qedr_qp_has_sq(qp)) {
+		in_params.sq_num_pages = qp->usq.pbl_info.num_pbes;
+		in_params.sq_pbl_ptr = qp->usq.pbl_tbl->pa;
+	}
+
+	if (qedr_qp_has_rq(qp)) {
 		in_params.rq_num_pages = qp->urq.pbl_info.num_pbes;
 		in_params.rq_pbl_ptr = qp->urq.pbl_tbl->pa;
 	}
 
+	if (ctx)
+		SET_FIELD(in_params.flags, QED_ROCE_EDPM_MODE, ctx->edpm_mode);
+
 	qp->qed_qp = dev->ops->rdma_create_qp(dev->rdma_ctx,
 					      &in_params, &out_params);
 
@@ -1651,29 +1909,79 @@
 	qp->qp_id = out_params.qp_id;
 	qp->icid = out_params.icid;
 
-	rc = qedr_copy_qp_uresp(dev, qp, udata);
-	if (rc)
-		goto err;
+	if (udata) {
+		rc = qedr_copy_qp_uresp(dev, qp, udata, &uresp);
+		if (rc)
+			goto err;
+	}
 
+	/* db offset was calculated in copy_qp_uresp, now set in the user q */
+	if (qedr_qp_has_sq(qp)) {
+		qp->usq.db_addr = ctx->dpi_addr + uresp.sq_db_offset;
+		qp->sq.max_wr = attrs->cap.max_send_wr;
+		rc = qedr_db_recovery_add(dev, qp->usq.db_addr,
+					  &qp->usq.db_rec_data->db_data,
+					  DB_REC_WIDTH_32B,
+					  DB_REC_USER);
+		if (rc)
+			goto err;
+	}
+
+	if (qedr_qp_has_rq(qp)) {
+		qp->urq.db_addr = ctx->dpi_addr + uresp.rq_db_offset;
+		qp->rq.max_wr = attrs->cap.max_recv_wr;
+		rc = qedr_db_recovery_add(dev, qp->urq.db_addr,
+					  &qp->urq.db_rec_data->db_data,
+					  DB_REC_WIDTH_32B,
+					  DB_REC_USER);
+		if (rc)
+			goto err;
+	}
+
+	if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
+		qp->urq.db_rec_db2_addr = ctx->dpi_addr + uresp.rq_db2_offset;
+
+		/* calculate the db_rec_db2 data since it is constant so no
+		 * need to reflect from user
+		 */
+		qp->urq.db_rec_db2_data.data.icid = cpu_to_le16(qp->icid);
+		qp->urq.db_rec_db2_data.data.value =
+			cpu_to_le16(DQ_TCM_IWARP_POST_RQ_CF_CMD);
+
+		rc = qedr_db_recovery_add(dev, qp->urq.db_rec_db2_addr,
+					  &qp->urq.db_rec_db2_data,
+					  DB_REC_WIDTH_32B,
+					  DB_REC_USER);
+		if (rc)
+			goto err;
+	}
 	qedr_qp_user_print(dev, qp);
-
-	return 0;
+	return rc;
 err:
 	rc = dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp);
 	if (rc)
 		DP_ERR(dev, "create qp: fatal fault. rc=%d", rc);
 
 err1:
-	qedr_cleanup_user(dev, qp);
+	qedr_cleanup_user(dev, ctx, qp);
 	return rc;
 }
 
-static void qedr_set_iwarp_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
+static int qedr_set_iwarp_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
 {
+	int rc;
+
 	qp->sq.db = dev->db_addr +
 	    DB_ADDR_SHIFT(DQ_PWM_OFFSET_XCM_RDMA_SQ_PROD);
 	qp->sq.db_data.data.icid = qp->icid;
 
+	rc = qedr_db_recovery_add(dev, qp->sq.db,
+				  &qp->sq.db_data,
+				  DB_REC_WIDTH_32B,
+				  DB_REC_KERNEL);
+	if (rc)
+		return rc;
+
 	qp->rq.db = dev->db_addr +
 		    DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_IWARP_RQ_PROD);
 	qp->rq.db_data.data.icid = qp->icid;
@@ -1681,6 +1989,19 @@
 			   DB_ADDR_SHIFT(DQ_PWM_OFFSET_TCM_FLAGS);
 	qp->rq.iwarp_db2_data.data.icid = qp->icid;
 	qp->rq.iwarp_db2_data.data.value = DQ_TCM_IWARP_POST_RQ_CF_CMD;
+
+	rc = qedr_db_recovery_add(dev, qp->rq.db,
+				  &qp->rq.db_data,
+				  DB_REC_WIDTH_32B,
+				  DB_REC_KERNEL);
+	if (rc)
+		return rc;
+
+	rc = qedr_db_recovery_add(dev, qp->rq.iwarp_db2,
+				  &qp->rq.iwarp_db2_data,
+				  DB_REC_WIDTH_32B,
+				  DB_REC_KERNEL);
+	return rc;
 }
 
 static int
@@ -1690,29 +2011,28 @@
 			   u32 n_sq_elems, u32 n_rq_elems)
 {
 	struct qed_rdma_create_qp_out_params out_params;
+	struct qed_chain_init_params params = {
+		.mode		= QED_CHAIN_MODE_PBL,
+		.cnt_type	= QED_CHAIN_CNT_TYPE_U32,
+	};
 	int rc;
 
-	rc = dev->ops->common->chain_alloc(dev->cdev,
-					   QED_CHAIN_USE_TO_PRODUCE,
-					   QED_CHAIN_MODE_PBL,
-					   QED_CHAIN_CNT_TYPE_U32,
-					   n_sq_elems,
-					   QEDR_SQE_ELEMENT_SIZE,
-					   &qp->sq.pbl, NULL);
+	params.intended_use = QED_CHAIN_USE_TO_PRODUCE;
+	params.num_elems = n_sq_elems;
+	params.elem_size = QEDR_SQE_ELEMENT_SIZE;
 
+	rc = dev->ops->common->chain_alloc(dev->cdev, &qp->sq.pbl, &params);
 	if (rc)
 		return rc;
 
 	in_params->sq_num_pages = qed_chain_get_page_cnt(&qp->sq.pbl);
 	in_params->sq_pbl_ptr = qed_chain_get_pbl_phys(&qp->sq.pbl);
 
-	rc = dev->ops->common->chain_alloc(dev->cdev,
-					   QED_CHAIN_USE_TO_CONSUME_PRODUCE,
-					   QED_CHAIN_MODE_PBL,
-					   QED_CHAIN_CNT_TYPE_U32,
-					   n_rq_elems,
-					   QEDR_RQE_ELEMENT_SIZE,
-					   &qp->rq.pbl, NULL);
+	params.intended_use = QED_CHAIN_USE_TO_CONSUME_PRODUCE;
+	params.num_elems = n_rq_elems;
+	params.elem_size = QEDR_RQE_ELEMENT_SIZE;
+
+	rc = dev->ops->common->chain_alloc(dev->cdev, &qp->rq.pbl, &params);
 	if (rc)
 		return rc;
 
@@ -1728,8 +2048,7 @@
 	qp->qp_id = out_params.qp_id;
 	qp->icid = out_params.icid;
 
-	qedr_set_roce_db_info(dev, qp);
-	return rc;
+	return qedr_set_roce_db_info(dev, qp);
 }
 
 static int
@@ -1739,14 +2058,19 @@
 			    u32 n_sq_elems, u32 n_rq_elems)
 {
 	struct qed_rdma_create_qp_out_params out_params;
-	struct qed_chain_ext_pbl ext_pbl;
+	struct qed_chain_init_params params = {
+		.mode		= QED_CHAIN_MODE_PBL,
+		.cnt_type	= QED_CHAIN_CNT_TYPE_U32,
+	};
 	int rc;
 
 	in_params->sq_num_pages = QED_CHAIN_PAGE_CNT(n_sq_elems,
 						     QEDR_SQE_ELEMENT_SIZE,
+						     QED_CHAIN_PAGE_SIZE,
 						     QED_CHAIN_MODE_PBL);
 	in_params->rq_num_pages = QED_CHAIN_PAGE_CNT(n_rq_elems,
 						     QEDR_RQE_ELEMENT_SIZE,
+						     QED_CHAIN_PAGE_SIZE,
 						     QED_CHAIN_MODE_PBL);
 
 	qp->qed_qp = dev->ops->rdma_create_qp(dev->rdma_ctx,
@@ -1756,39 +2080,31 @@
 		return -EINVAL;
 
 	/* Now we allocate the chain */
-	ext_pbl.p_pbl_virt = out_params.sq_pbl_virt;
-	ext_pbl.p_pbl_phys = out_params.sq_pbl_phys;
 
-	rc = dev->ops->common->chain_alloc(dev->cdev,
-					   QED_CHAIN_USE_TO_PRODUCE,
-					   QED_CHAIN_MODE_PBL,
-					   QED_CHAIN_CNT_TYPE_U32,
-					   n_sq_elems,
-					   QEDR_SQE_ELEMENT_SIZE,
-					   &qp->sq.pbl, &ext_pbl);
+	params.intended_use = QED_CHAIN_USE_TO_PRODUCE;
+	params.num_elems = n_sq_elems;
+	params.elem_size = QEDR_SQE_ELEMENT_SIZE;
+	params.ext_pbl_virt = out_params.sq_pbl_virt;
+	params.ext_pbl_phys = out_params.sq_pbl_phys;
 
+	rc = dev->ops->common->chain_alloc(dev->cdev, &qp->sq.pbl, &params);
 	if (rc)
 		goto err;
 
-	ext_pbl.p_pbl_virt = out_params.rq_pbl_virt;
-	ext_pbl.p_pbl_phys = out_params.rq_pbl_phys;
+	params.intended_use = QED_CHAIN_USE_TO_CONSUME_PRODUCE;
+	params.num_elems = n_rq_elems;
+	params.elem_size = QEDR_RQE_ELEMENT_SIZE;
+	params.ext_pbl_virt = out_params.rq_pbl_virt;
+	params.ext_pbl_phys = out_params.rq_pbl_phys;
 
-	rc = dev->ops->common->chain_alloc(dev->cdev,
-					   QED_CHAIN_USE_TO_CONSUME_PRODUCE,
-					   QED_CHAIN_MODE_PBL,
-					   QED_CHAIN_CNT_TYPE_U32,
-					   n_rq_elems,
-					   QEDR_RQE_ELEMENT_SIZE,
-					   &qp->rq.pbl, &ext_pbl);
-
+	rc = dev->ops->common->chain_alloc(dev->cdev, &qp->rq.pbl, &params);
 	if (rc)
 		goto err;
 
 	qp->qp_id = out_params.qp_id;
 	qp->icid = out_params.icid;
 
-	qedr_set_iwarp_db_info(dev, qp);
-	return rc;
+	return qedr_set_iwarp_db_info(dev, qp);
 
 err:
 	dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp);
@@ -1803,6 +2119,20 @@
 
 	dev->ops->common->chain_free(dev->cdev, &qp->rq.pbl);
 	kfree(qp->rqe_wr_id);
+
+	/* GSI qp is not registered to db mechanism so no need to delete */
+	if (qp->qp_type == IB_QPT_GSI)
+		return;
+
+	qedr_db_recovery_del(dev, qp->sq.db, &qp->sq.db_data);
+
+	if (!qp->srq) {
+		qedr_db_recovery_del(dev, qp->rq.db, &qp->rq.db_data);
+
+		if (rdma_protocol_iwarp(&dev->ibdev, 1))
+			qedr_db_recovery_del(dev, qp->rq.iwarp_db2,
+					     &qp->rq.iwarp_db2_data);
+	}
 }
 
 static int qedr_create_kernel_qp(struct qedr_dev *dev,
@@ -1881,16 +2211,47 @@
 	return rc;
 }
 
+static int qedr_free_qp_resources(struct qedr_dev *dev, struct qedr_qp *qp,
+				  struct ib_udata *udata)
+{
+	struct qedr_ucontext *ctx =
+		rdma_udata_to_drv_context(udata, struct qedr_ucontext,
+					  ibucontext);
+	int rc;
+
+	if (qp->qp_type != IB_QPT_GSI) {
+		rc = dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp);
+		if (rc)
+			return rc;
+	}
+
+	if (qp->create_type == QEDR_QP_CREATE_USER)
+		qedr_cleanup_user(dev, ctx, qp);
+	else
+		qedr_cleanup_kernel(dev, qp);
+
+	return 0;
+}
+
 struct ib_qp *qedr_create_qp(struct ib_pd *ibpd,
 			     struct ib_qp_init_attr *attrs,
 			     struct ib_udata *udata)
 {
-	struct qedr_dev *dev = get_qedr_dev(ibpd->device);
-	struct qedr_pd *pd = get_qedr_pd(ibpd);
+	struct qedr_xrcd *xrcd = NULL;
+	struct qedr_pd *pd = NULL;
+	struct qedr_dev *dev;
 	struct qedr_qp *qp;
 	struct ib_qp *ibqp;
 	int rc = 0;
 
+	if (attrs->qp_type == IB_QPT_XRC_TGT) {
+		xrcd = get_qedr_xrcd(attrs->xrcd);
+		dev = get_qedr_dev(xrcd->ibxrcd.device);
+	} else {
+		pd = get_qedr_pd(ibpd);
+		dev = get_qedr_dev(ibpd->device);
+	}
+
 	DP_DEBUG(dev, QEDR_MSG_QP, "create qp: called from %s, pd=%p\n",
 		 udata ? "user library" : "kernel", pd);
 
@@ -1921,25 +2282,27 @@
 		return ibqp;
 	}
 
-	if (udata)
+	if (udata || xrcd)
 		rc = qedr_create_user_qp(dev, qp, ibpd, udata, attrs);
 	else
 		rc = qedr_create_kernel_qp(dev, qp, ibpd, attrs);
 
 	if (rc)
-		goto err;
+		goto out_free_qp;
 
 	qp->ibqp.qp_num = qp->qp_id;
 
 	if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
 		rc = xa_insert(&dev->qps, qp->qp_id, qp, GFP_KERNEL);
 		if (rc)
-			goto err;
+			goto out_free_qp_resources;
 	}
 
 	return &qp->ibqp;
 
-err:
+out_free_qp_resources:
+	qedr_free_qp_resources(dev, qp, udata);
+out_free_qp:
 	kfree(qp);
 
 	return ERR_PTR(-EFAULT);
@@ -2383,15 +2746,18 @@
 	int rc = 0;
 
 	memset(&params, 0, sizeof(params));
-
-	rc = dev->ops->rdma_query_qp(dev->rdma_ctx, qp->qed_qp, &params);
-	if (rc)
-		goto err;
-
 	memset(qp_attr, 0, sizeof(*qp_attr));
 	memset(qp_init_attr, 0, sizeof(*qp_init_attr));
 
-	qp_attr->qp_state = qedr_get_ibqp_state(params.state);
+	if (qp->qp_type != IB_QPT_GSI) {
+		rc = dev->ops->rdma_query_qp(dev->rdma_ctx, qp->qed_qp, &params);
+		if (rc)
+			goto err;
+		qp_attr->qp_state = qedr_get_ibqp_state(params.state);
+	} else {
+		qp_attr->qp_state = qedr_get_ibqp_state(QED_ROCE_QP_STATE_RTS);
+	}
+
 	qp_attr->cur_qp_state = qedr_get_ibqp_state(params.state);
 	qp_attr->path_mtu = ib_mtu_int_to_enum(params.mtu);
 	qp_attr->path_mig_state = IB_MIG_MIGRATED;
@@ -2440,25 +2806,6 @@
 	return rc;
 }
 
-static int qedr_free_qp_resources(struct qedr_dev *dev, struct qedr_qp *qp,
-				  struct ib_udata *udata)
-{
-	int rc = 0;
-
-	if (qp->qp_type != IB_QPT_GSI) {
-		rc = dev->ops->rdma_destroy_qp(dev->rdma_ctx, qp->qed_qp);
-		if (rc)
-			return rc;
-	}
-
-	if (qp->create_type == QEDR_QP_CREATE_USER)
-		qedr_cleanup_user(dev, qp);
-	else
-		qedr_cleanup_kernel(dev, qp);
-
-	return 0;
-}
-
 int qedr_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
 	struct qedr_qp *qp = get_qedr_qp(ibqp);
@@ -2524,21 +2871,22 @@
 	return 0;
 }
 
-int qedr_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, u32 flags,
+int qedr_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
 		   struct ib_udata *udata)
 {
 	struct qedr_ah *ah = get_qedr_ah(ibah);
 
-	rdma_copy_ah_attr(&ah->attr, attr);
+	rdma_copy_ah_attr(&ah->attr, init_attr->ah_attr);
 
 	return 0;
 }
 
-void qedr_destroy_ah(struct ib_ah *ibah, u32 flags)
+int qedr_destroy_ah(struct ib_ah *ibah, u32 flags)
 {
 	struct qedr_ah *ah = get_qedr_ah(ibah);
 
 	rdma_destroy_ah_attr(&ah->attr);
+	return 0;
 }
 
 static void free_mr_info(struct qedr_dev *dev, struct mr_info *info)
@@ -2623,13 +2971,14 @@
 
 	mr->type = QEDR_MR_USER;
 
-	mr->umem = ib_umem_get(udata, start, len, acc, 0);
+	mr->umem = ib_umem_get(ibpd->device, start, len, acc);
 	if (IS_ERR(mr->umem)) {
 		rc = -EFAULT;
 		goto err0;
 	}
 
-	rc = init_mr_info(dev, &mr->info, ib_umem_page_count(mr->umem), 1);
+	rc = init_mr_info(dev, &mr->info,
+			  ib_umem_num_dma_blocks(mr->umem, PAGE_SIZE), 1);
 	if (rc)
 		goto err1;
 
@@ -2656,10 +3005,8 @@
 	mr->hw_mr.pbl_two_level = mr->info.pbl_info.two_layered;
 	mr->hw_mr.pbl_page_size_log = ilog2(mr->info.pbl_info.pbl_size);
 	mr->hw_mr.page_size_log = PAGE_SHIFT;
-	mr->hw_mr.fbo = ib_umem_offset(mr->umem);
 	mr->hw_mr.length = len;
 	mr->hw_mr.vaddr = usr_addr;
-	mr->hw_mr.zbva = false;
 	mr->hw_mr.phy_mr = false;
 	mr->hw_mr.dma_mr = false;
 
@@ -2752,10 +3099,8 @@
 	mr->hw_mr.pbl_ptr = 0;
 	mr->hw_mr.pbl_two_level = mr->info.pbl_info.two_layered;
 	mr->hw_mr.pbl_page_size_log = ilog2(mr->info.pbl_info.pbl_size);
-	mr->hw_mr.fbo = 0;
 	mr->hw_mr.length = 0;
 	mr->hw_mr.vaddr = 0;
-	mr->hw_mr.zbva = false;
 	mr->hw_mr.phy_mr = true;
 	mr->hw_mr.dma_mr = false;
 
@@ -2779,7 +3124,7 @@
 }
 
 struct ib_mr *qedr_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
-			    u32 max_num_sg, struct ib_udata *udata)
+			    u32 max_num_sg)
 {
 	struct qedr_mr *mr;
 
@@ -3296,7 +3641,7 @@
 		break;
 	case IB_WR_RDMA_READ_WITH_INV:
 		SET_FIELD2(wqe->flags, RDMA_SQ_RDMA_WQE_1ST_READ_INV_FLG, 1);
-		/* fallthrough -- same is identical to RDMA READ */
+		fallthrough;	/* same is identical to RDMA READ */
 
 	case IB_WR_RDMA_READ:
 		wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_RD;
@@ -3533,10 +3878,10 @@
 		 * in first 4 bytes and need to update WQE producer in
 		 * next 4 bytes.
 		 */
-		srq->hw_srq.virt_prod_pair_addr->sge_prod = hw_srq->sge_prod;
+		srq->hw_srq.virt_prod_pair_addr->sge_prod = cpu_to_le32(hw_srq->sge_prod);
 		/* Make sure sge producer is updated first */
 		dma_wmb();
-		srq->hw_srq.virt_prod_pair_addr->wqe_prod = hw_srq->wqe_prod;
+		srq->hw_srq.virt_prod_pair_addr->wqe_prod = cpu_to_le32(hw_srq->wqe_prod);
 
 		wr = wr->next;
 	}
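
Note: the SRQ producer update now stores explicit little-endian values, and the dma_wmb() between the two stores keeps the SGE producer visible to the device before the WQE producer. A generic sketch of the ordering pattern (struct prod_pair and publish_producers() are hypothetical, for illustration only):

/* Hypothetical producer pair shared with the device, stored little-endian. */
struct prod_pair {
	__le32 sge_prod;
	__le32 wqe_prod;
};

static void publish_producers(struct prod_pair *p, u32 sge, u32 wqe)
{
	p->sge_prod = cpu_to_le32(sge);
	dma_wmb();		/* sge_prod must reach the device before wqe_prod */
	p->wqe_prod = cpu_to_le32(wqe);
}
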
@@ -4130,19 +4475,10 @@
 }
 
 int qedr_process_mad(struct ib_device *ibdev, int process_mad_flags,
-		     u8 port_num,
-		     const struct ib_wc *in_wc,
-		     const struct ib_grh *in_grh,
-		     const struct ib_mad_hdr *mad_hdr,
-		     size_t in_mad_size, struct ib_mad_hdr *out_mad,
-		     size_t *out_mad_size, u16 *out_mad_pkey_index)
+		     u8 port_num, const struct ib_wc *in_wc,
+		     const struct ib_grh *in_grh, const struct ib_mad *in,
+		     struct ib_mad *out_mad, size_t *out_mad_size,
+		     u16 *out_mad_pkey_index)
 {
-	struct qedr_dev *dev = get_qedr_dev(ibdev);
-
-	DP_DEBUG(dev, QEDR_MSG_GSI,
-		 "QEDR_PROCESS_MAD in_mad %x %x %x %x %x %x %x %x\n",
-		 mad_hdr->attr_id, mad_hdr->base_version, mad_hdr->attr_mod,
-		 mad_hdr->class_specific, mad_hdr->class_version,
-		 mad_hdr->method, mad_hdr->mgmt_class, mad_hdr->status);
 	return IB_MAD_RESULT_SUCCESS;
 }
diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h
index 9aaa902..2672c32 100644
--- a/drivers/infiniband/hw/qedr/verbs.h
+++ b/drivers/infiniband/hw/qedr/verbs.h
@@ -35,8 +35,6 @@
 int qedr_query_device(struct ib_device *ibdev,
 		      struct ib_device_attr *attr, struct ib_udata *udata);
 int qedr_query_port(struct ib_device *, u8 port, struct ib_port_attr *props);
-int qedr_modify_port(struct ib_device *, u8 port, int mask,
-		     struct ib_port_modify *props);
 
 int qedr_iw_query_gid(struct ib_device *ibdev, u8 port,
 		      int index, union ib_gid *gid);
@@ -46,14 +44,16 @@
 int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
 void qedr_dealloc_ucontext(struct ib_ucontext *uctx);
 
-int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
+int qedr_mmap(struct ib_ucontext *ucontext, struct vm_area_struct *vma);
+void qedr_mmap_free(struct rdma_user_mmap_entry *rdma_entry);
 int qedr_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
-void qedr_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
-
+int qedr_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+int qedr_alloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata);
+int qedr_dealloc_xrcd(struct ib_xrcd *ibxrcd, struct ib_udata *udata);
 int qedr_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		   struct ib_udata *udata);
 int qedr_resize_cq(struct ib_cq *, int cqe, struct ib_udata *);
-void qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 struct ib_qp *qedr_create_qp(struct ib_pd *, struct ib_qp_init_attr *attrs,
 			     struct ib_udata *);
@@ -68,12 +68,12 @@
 int qedr_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		    enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
 int qedr_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
-void qedr_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
+int qedr_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
 int qedr_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 		       const struct ib_recv_wr **bad_recv_wr);
-int qedr_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, u32 flags,
+int qedr_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
 		   struct ib_udata *udata);
-void qedr_destroy_ah(struct ib_ah *ibah, u32 flags);
+int qedr_destroy_ah(struct ib_ah *ibah, u32 flags);
 
 int qedr_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
 struct ib_mr *qedr_get_dma_mr(struct ib_pd *, int acc);
@@ -85,7 +85,7 @@
 		   int sg_nents, unsigned int *sg_offset);
 
 struct ib_mr *qedr_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			    u32 max_num_sg, struct ib_udata *udata);
+			    u32 max_num_sg);
 int qedr_poll_cq(struct ib_cq *, int num_entries, struct ib_wc *wc);
 int qedr_post_send(struct ib_qp *, const struct ib_send_wr *,
 		   const struct ib_send_wr **bad_wr);
@@ -93,10 +93,9 @@
 		   const struct ib_recv_wr **bad_wr);
 int qedr_process_mad(struct ib_device *ibdev, int process_mad_flags,
 		     u8 port_num, const struct ib_wc *in_wc,
-		     const struct ib_grh *in_grh,
-		     const struct ib_mad_hdr *in_mad,
-		     size_t in_mad_size, struct ib_mad_hdr *out_mad,
-		     size_t *out_mad_size, u16 *out_mad_pkey_index);
+		     const struct ib_grh *in_grh, const struct ib_mad *in_mad,
+		     struct ib_mad *out_mad, size_t *out_mad_size,
+		     u16 *out_mad_pkey_index);
 
 int qedr_port_immutable(struct ib_device *ibdev, u8 port_num,
 			struct ib_port_immutable *immutable);
diff --git a/drivers/infiniband/hw/qib/Kconfig b/drivers/infiniband/hw/qib/Kconfig
index 376d19f..6c48957 100644
--- a/drivers/infiniband/hw/qib/Kconfig
+++ b/drivers/infiniband/hw/qib/Kconfig
@@ -3,7 +3,7 @@
 	tristate "Intel PCIe HCA support"
 	depends on 64BIT && INFINIBAND_RDMAVT
 	depends on PCI
-	---help---
+	help
 	This is a low-level driver for Intel PCIe QLE InfiniBand host
 	channel adapters.  This driver does not support the Intel
 	HyperTransport card (model QHT7140).
@@ -12,6 +12,6 @@
 	bool "QIB DCA support"
 	depends on INFINIBAND_QIB && DCA && SMP && !(INFINIBAND_QIB=y && DCA=m)
 	default y
-	---help---
+	help
 	Setting this enables DCA support on some Intel chip sets
 	with the iba7322 HCA.
diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h
index 432d6d0..ee21142 100644
--- a/drivers/infiniband/hw/qib/qib.h
+++ b/drivers/infiniband/hw/qib/qib.h
@@ -619,11 +619,11 @@
 	/* LID mask control */
 	u8 lmc;
 	u8 link_width_supported;
-	u8 link_speed_supported;
+	u16 link_speed_supported;
 	u8 link_width_enabled;
-	u8 link_speed_enabled;
+	u16 link_speed_enabled;
 	u8 link_width_active;
-	u8 link_speed_active;
+	u16 link_speed_active;
 	u8 vls_supported;
 	u8 vls_operational;
 	/* Rx Polarity inversion (compensate for ~tx on partner) */
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index b014422..ff87a67 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -40,10 +40,10 @@
 #include <linux/highmem.h>
 #include <linux/io.h>
 #include <linux/jiffies.h>
-#include <asm/pgtable.h>
 #include <linux/delay.h>
 #include <linux/export.h>
 #include <linux/uio.h>
+#include <linux/pgtable.h>
 
 #include <rdma/ib.h>
 
diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c
index 531d8a1..44150be 100644
--- a/drivers/infiniband/hw/qib/qib_iba6120.c
+++ b/drivers/infiniband/hw/qib/qib_iba6120.c
@@ -1417,7 +1417,6 @@
  *
  * The exact combo of LEDs if on is true is determined by looking
  * at the ibcstatus.
-
  * These LEDs indicate the physical and logical state of IB link.
  * For this chip (at least with recommended board pinouts), LED1
  * is Yellow (logical state) and LED2 is Green (physical state),
@@ -2974,11 +2973,11 @@
 		state = IB_PORT_ARMED;
 		break;
 	case IB_6120_L_STATE_ACTIVE:
-		/* fall through */
 	case IB_6120_L_STATE_ACT_DEFER:
 		state = IB_PORT_ACTIVE;
 		break;
-	default: /* fall through */
+	default:
+		fallthrough;
 	case IB_6120_L_STATE_DOWN:
 		state = IB_PORT_DOWN;
 		break;
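
Note: throughout the qib files, "/* fall through */" comments are converted to the fallthrough pseudo-keyword (a macro for the compiler's fallthrough attribute from <linux/compiler_attributes.h>), which the compiler can verify. Minimal usage sketch with hypothetical case labels and helpers:

switch (state) {
case STATE_A:
	do_a();
	fallthrough;	/* deliberately continue into the STATE_B handling */
case STATE_B:
	do_b();
	break;
default:
	break;
}
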
diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c
index ea3ddb0..0a6f26d 100644
--- a/drivers/infiniband/hw/qib/qib_iba7220.c
+++ b/drivers/infiniband/hw/qib/qib_iba7220.c
@@ -3586,11 +3586,11 @@
 		state = IB_PORT_ARMED;
 		break;
 	case IB_7220_L_STATE_ACTIVE:
-		/* fall through */
 	case IB_7220_L_STATE_ACT_DEFER:
 		state = IB_PORT_ACTIVE;
 		break;
-	default: /* fall through */
+	default:
+		fallthrough;
 	case IB_7220_L_STATE_DOWN:
 		state = IB_PORT_DOWN;
 		break;
diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c
index dd48433..189a0ce 100644
--- a/drivers/infiniband/hw/qib/qib_iba7322.c
+++ b/drivers/infiniband/hw/qib/qib_iba7322.c
@@ -1733,9 +1733,9 @@
 	return;
 }
 
-static void qib_error_tasklet(unsigned long data)
+static void qib_error_tasklet(struct tasklet_struct *t)
 {
-	struct qib_devdata *dd = (struct qib_devdata *)data;
+	struct qib_devdata *dd = from_tasklet(dd, t, error_tasklet);
 
 	handle_7322_errors(dd);
 	qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
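
Note: the error tasklet is moved to the tasklet_setup()/from_tasklet() API: the callback receives the tasklet_struct itself and recovers the containing device structure with from_tasklet() (a container_of() wrapper), replacing the unsigned long cookie. Sketch with a hypothetical wrapper struct:

#include <linux/interrupt.h>

struct my_dev {
	struct tasklet_struct error_tasklet;
	/* ... device state ... */
};

static void my_error_tasklet(struct tasklet_struct *t)
{
	struct my_dev *dd = from_tasklet(dd, t, error_tasklet);

	/* handle errors for dd */
}

static void my_dev_init_tasklet(struct my_dev *dd)
{
	tasklet_setup(&dd->error_tasklet, my_error_tasklet);
}
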
@@ -2375,7 +2375,6 @@
 	struct qib_devdata *dd = ppd->dd;
 	u64 val, guid, ibc;
 	unsigned long flags;
-	int ret = 0;
 
 	/*
 	 * SerDes model not in Pd, but still need to
@@ -2510,7 +2509,7 @@
 		val | ERR_MASK_N(IBStatusChanged));
 
 	/* Always zero until we start messing with SerDes for real */
-	return ret;
+	return 0;
 }
 
 /**
@@ -3538,8 +3537,7 @@
 	for (i = 0; i < ARRAY_SIZE(redirect); i++)
 		qib_write_kreg(dd, kr_intredirect + i, redirect[i]);
 	dd->cspec->main_int_mask = mask;
-	tasklet_init(&dd->error_tasklet, qib_error_tasklet,
-		(unsigned long)dd);
+	tasklet_setup(&dd->error_tasklet, qib_error_tasklet);
 }
 
 /**
@@ -5509,11 +5507,11 @@
 		state = IB_PORT_ARMED;
 		break;
 	case IB_7322_L_STATE_ACTIVE:
-		/* fall through */
 	case IB_7322_L_STATE_ACT_DEFER:
 		state = IB_PORT_ACTIVE;
 		break;
-	default: /* fall through */
+	default:
+		fallthrough;
 	case IB_7322_L_STATE_DOWN:
 		state = IB_PORT_DOWN;
 		break;
@@ -6534,7 +6532,7 @@
 				    "Invalid num_vls %u, using 4 VLs\n",
 				    qib_num_cfg_vls);
 			qib_num_cfg_vls = 4;
-			/* fall through */
+			fallthrough;
 		case 4:
 			ppd->vls_supported = IB_VL_VL0_3;
 			break;
@@ -6630,7 +6628,7 @@
 	/* vl15 buffers start just after the 4k buffers */
 	vl15off = dd->physaddr + (dd->piobufbase >> 32) +
 		  dd->piobcnt4k * dd->align4k;
-	dd->piovl15base	= ioremap_nocache(vl15off,
+	dd->piovl15base	= ioremap(vl15off,
 					  NUM_VL15_BUFS * dd->align4k);
 	if (!dd->piovl15base) {
 		ret = -ENOMEM;
@@ -6875,7 +6873,7 @@
 	struct qib_devdata *dd = ppd->dd;
 	unsigned lastbuf, erstbuf;
 	u64 senddmabufmask[3] = { 0 };
-	int n, ret = 0;
+	int n;
 
 	qib_write_kreg_port(ppd, krp_senddmabase, ppd->sdma_descq_phys);
 	qib_sdma_7322_setlengen(ppd);
@@ -6904,7 +6902,7 @@
 	qib_write_kreg_port(ppd, krp_senddmabufmask0, senddmabufmask[0]);
 	qib_write_kreg_port(ppd, krp_senddmabufmask1, senddmabufmask[1]);
 	qib_write_kreg_port(ppd, krp_senddmabufmask2, senddmabufmask[2]);
-	return ret;
+	return 0;
 }
 
 /* sdma_lock must be held */
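
The qib_error_tasklet() hunks above follow the kernel-wide tasklet API conversion: tasklet_setup() registers the callback and from_tasklet() (a container_of() wrapper) recovers the owning structure, replacing the old unsigned long cookie. A minimal sketch of the pattern with hypothetical names (my_dev and err_task are illustrative, not qib symbols):

#include <linux/interrupt.h>

struct my_dev {
	struct tasklet_struct err_task;
	/* ... other driver state ... */
};

/* New-style callback: receives the tasklet, recovers its container. */
static void my_error_tasklet(struct tasklet_struct *t)
{
	struct my_dev *dd = from_tasklet(dd, t, err_task);

	/* handle errors using dd ... */
}

static void my_dev_init_tasklet(struct my_dev *dd)
{
	/* Replaces tasklet_init(&dd->err_task, fn, (unsigned long)dd). */
	tasklet_setup(&dd->err_task, my_error_tasklet);
}
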
diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c
index d4fd8a6..43c8ee1 100644
--- a/drivers/infiniband/hw/qib/qib_init.c
+++ b/drivers/infiniband/hw/qib/qib_init.c
@@ -1759,7 +1759,7 @@
 		qib_userlen = dd->ureg_align * dd->cfgctxts;
 
 	/* Sanity checks passed, now create the new mappings */
-	qib_kregbase = ioremap_nocache(qib_physaddr, qib_kreglen);
+	qib_kregbase = ioremap(qib_physaddr, qib_kreglen);
 	if (!qib_kregbase)
 		goto bail;
 
@@ -1768,7 +1768,7 @@
 		goto bail_kregbase;
 
 	if (qib_userlen) {
-		qib_userbase = ioremap_nocache(qib_physaddr + dd->uregbase,
+		qib_userbase = ioremap(qib_physaddr + dd->uregbase,
 					       qib_userlen);
 		if (!qib_userbase)
 			goto bail_piobase;
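
The ioremap_nocache() to ioremap() changes here and in the other qib files are mechanical: the _nocache variant was an alias for ioremap(), which already returns an uncached MMIO mapping, and the alias was removed. A sketch of a BAR mapping using that API (map_bar0 is an illustrative helper, not driver code):

#include <linux/io.h>
#include <linux/pci.h>

static void __iomem *map_bar0(struct pci_dev *pdev)
{
	/* ioremap() gives a non-cached MMIO mapping directly;
	 * the caller unmaps it later with iounmap().
	 */
	return ioremap(pci_resource_start(pdev, 0),
		       pci_resource_len(pdev, 0));
}
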
diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c
index f92faf5..f83e331 100644
--- a/drivers/infiniband/hw/qib/qib_mad.c
+++ b/drivers/infiniband/hw/qib/qib_mad.c
@@ -433,7 +433,7 @@
 			/* Bad mkey not a violation below level 2 */
 			if (ibp->rvp.mkeyprot < 2)
 				break;
-			/* fall through */
+			fallthrough;
 		case IB_MGMT_METHOD_SET:
 		case IB_MGMT_METHOD_TRAP_REPRESS:
 			if (ibp->rvp.mkey_violations != 0xFFFF)
@@ -828,7 +828,7 @@
 	case IB_PORT_NOP:
 		if (lstate == 0)
 			break;
-		/* FALLTHROUGH */
+		fallthrough;
 	case IB_PORT_DOWN:
 		if (lstate == 0)
 			lstate = QIB_IB_LINKDOWN_ONLY;
@@ -1928,7 +1928,7 @@
 				ret = IB_MAD_RESULT_SUCCESS;
 				goto bail;
 			}
-			/* FALLTHROUGH */
+			fallthrough;
 		default:
 			smp->status |= IB_SMP_UNSUP_METH_ATTR;
 			ret = reply(smp);
@@ -1962,7 +1962,7 @@
 				ret = IB_MAD_RESULT_SUCCESS;
 				goto bail;
 			}
-			/* FALLTHROUGH */
+			fallthrough;
 		default:
 			smp->status |= IB_SMP_UNSUP_METH_ATTR;
 			ret = reply(smp);
@@ -2098,8 +2098,6 @@
 	struct ib_cc_classportinfo_attr *p =
 		(struct ib_cc_classportinfo_attr *)ccp->mgmt_data;
 
-	memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data));
-
 	p->base_version = 1;
 	p->class_version = 1;
 	p->cap_mask = 0;
@@ -2120,8 +2118,6 @@
 	struct qib_ibport *ibp = to_iport(ibdev, port);
 	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
 
-	memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data));
-
 	p->congestion_info = 0;
 	p->control_table_cap = ppd->cc_max_table_entries;
 
@@ -2138,8 +2134,6 @@
 	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
 	struct ib_cc_congestion_entry_shadow *entries;
 
-	memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data));
-
 	spin_lock(&ppd->cc_shadow_lock);
 
 	entries = ppd->congestion_entries_shadow->entries;
@@ -2176,8 +2170,6 @@
 	if (cct_block_index > IB_CC_TABLE_CAP_DEFAULT - 1)
 		goto bail;
 
-	memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data));
-
 	spin_lock(&ppd->cc_shadow_lock);
 
 	max_cct_block =
@@ -2296,92 +2288,55 @@
 	return reply_failure((struct ib_smp *) ccp);
 }
 
-static int check_cc_key(struct qib_ibport *ibp,
-			struct ib_cc_mad *ccp, int mad_flags)
-{
-	return 0;
-}
-
 static int process_cc(struct ib_device *ibdev, int mad_flags,
 			u8 port, const struct ib_mad *in_mad,
 			struct ib_mad *out_mad)
 {
 	struct ib_cc_mad *ccp = (struct ib_cc_mad *)out_mad;
-	struct qib_ibport *ibp = to_iport(ibdev, port);
-	int ret;
-
 	*out_mad = *in_mad;
 
 	if (ccp->class_version != 2) {
 		ccp->status |= IB_SMP_UNSUP_VERSION;
-		ret = reply((struct ib_smp *)ccp);
-		goto bail;
+		return reply((struct ib_smp *)ccp);
 	}
 
-	ret = check_cc_key(ibp, ccp, mad_flags);
-	if (ret)
-		goto bail;
-
 	switch (ccp->method) {
 	case IB_MGMT_METHOD_GET:
 		switch (ccp->attr_id) {
 		case IB_CC_ATTR_CLASSPORTINFO:
-			ret = cc_get_classportinfo(ccp, ibdev);
-			goto bail;
-
+			return cc_get_classportinfo(ccp, ibdev);
 		case IB_CC_ATTR_CONGESTION_INFO:
-			ret = cc_get_congestion_info(ccp, ibdev, port);
-			goto bail;
-
+			return cc_get_congestion_info(ccp, ibdev, port);
 		case IB_CC_ATTR_CA_CONGESTION_SETTING:
-			ret = cc_get_congestion_setting(ccp, ibdev, port);
-			goto bail;
-
+			return cc_get_congestion_setting(ccp, ibdev, port);
 		case IB_CC_ATTR_CONGESTION_CONTROL_TABLE:
-			ret = cc_get_congestion_control_table(ccp, ibdev, port);
-			goto bail;
-
-			/* FALLTHROUGH */
+			return cc_get_congestion_control_table(ccp, ibdev, port);
 		default:
 			ccp->status |= IB_SMP_UNSUP_METH_ATTR;
-			ret = reply((struct ib_smp *) ccp);
-			goto bail;
+			return reply((struct ib_smp *) ccp);
 		}
-
 	case IB_MGMT_METHOD_SET:
 		switch (ccp->attr_id) {
 		case IB_CC_ATTR_CA_CONGESTION_SETTING:
-			ret = cc_set_congestion_setting(ccp, ibdev, port);
-			goto bail;
-
+			return cc_set_congestion_setting(ccp, ibdev, port);
 		case IB_CC_ATTR_CONGESTION_CONTROL_TABLE:
-			ret = cc_set_congestion_control_table(ccp, ibdev, port);
-			goto bail;
-
-			/* FALLTHROUGH */
+			return cc_set_congestion_control_table(ccp, ibdev, port);
 		default:
 			ccp->status |= IB_SMP_UNSUP_METH_ATTR;
-			ret = reply((struct ib_smp *) ccp);
-			goto bail;
+			return reply((struct ib_smp *) ccp);
 		}
-
 	case IB_MGMT_METHOD_GET_RESP:
 		/*
 		 * The ib_mad module will call us to process responses
 		 * before checking for other consumers.
 		 * Just tell the caller to process it normally.
 		 */
-		ret = IB_MAD_RESULT_SUCCESS;
-		goto bail;
-
-	case IB_MGMT_METHOD_TRAP:
-	default:
-		ccp->status |= IB_SMP_UNSUP_METHOD;
-		ret = reply((struct ib_smp *) ccp);
+		return IB_MAD_RESULT_SUCCESS;
 	}
 
-bail:
-	return ret;
+	/* method is unsupported */
+	ccp->status |= IB_SMP_UNSUP_METHOD;
+	return reply((struct ib_smp *) ccp);
 }
 
 /**
@@ -2405,28 +2360,21 @@
  */
 int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
 		    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-		    const struct ib_mad_hdr *in, size_t in_mad_size,
-		    struct ib_mad_hdr *out, size_t *out_mad_size,
-		    u16 *out_mad_pkey_index)
+		    const struct ib_mad *in, struct ib_mad *out,
+		    size_t *out_mad_size, u16 *out_mad_pkey_index)
 {
 	int ret;
 	struct qib_ibport *ibp = to_iport(ibdev, port);
 	struct qib_pportdata *ppd = ppd_from_ibp(ibp);
-	const struct ib_mad *in_mad = (const struct ib_mad *)in;
-	struct ib_mad *out_mad = (struct ib_mad *)out;
 
-	if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) ||
-			 *out_mad_size != sizeof(*out_mad)))
-		return IB_MAD_RESULT_FAILURE;
-
-	switch (in_mad->mad_hdr.mgmt_class) {
+	switch (in->mad_hdr.mgmt_class) {
 	case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
 	case IB_MGMT_CLASS_SUBN_LID_ROUTED:
-		ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad);
+		ret = process_subn(ibdev, mad_flags, port, in, out);
 		goto bail;
 
 	case IB_MGMT_CLASS_PERF_MGMT:
-		ret = process_perf(ibdev, port, in_mad, out_mad);
+		ret = process_perf(ibdev, port, in, out);
 		goto bail;
 
 	case IB_MGMT_CLASS_CONG_MGMT:
@@ -2435,7 +2383,7 @@
 			ret = IB_MAD_RESULT_SUCCESS;
 			goto bail;
 		}
-		ret = process_cc(ibdev, mad_flags, port, in_mad, out_mad);
+		ret = process_cc(ibdev, mad_flags, port, in, out);
 		goto bail;
 
 	default:
diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c
index 864f2af..3dc6ce0 100644
--- a/drivers/infiniband/hw/qib/qib_pcie.c
+++ b/drivers/infiniband/hw/qib/qib_pcie.c
@@ -145,7 +145,7 @@
 	addr = pci_resource_start(pdev, 0);
 	len = pci_resource_len(pdev, 0);
 
-	dd->kregbase = ioremap_nocache(addr, len);
+	dd->kregbase = ioremap(addr, len);
 	if (!dd->kregbase)
 		return -ENOMEM;
 
diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c
index aaf7438..3915e5b 100644
--- a/drivers/infiniband/hw/qib/qib_rc.c
+++ b/drivers/infiniband/hw/qib/qib_rc.c
@@ -83,7 +83,7 @@
 			rvt_put_mr(e->rdma_sge.mr);
 			e->rdma_sge.mr = NULL;
 		}
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(ATOMIC_ACKNOWLEDGE):
 		/*
 		 * We can increment the tail pointer now that the last
@@ -92,7 +92,7 @@
 		 */
 		if (++qp->s_tail_ack_queue > QIB_MAX_RDMA_ATOMIC)
 			qp->s_tail_ack_queue = 0;
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_ONLY):
 	case OP(ACKNOWLEDGE):
 		/* Check for no next entry in the queue. */
@@ -149,7 +149,7 @@
 
 	case OP(RDMA_READ_RESPONSE_FIRST):
 		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_READ_RESPONSE_MIDDLE):
 		qp->s_cur_sge = &qp->s_ack_rdma_sge;
 		qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
@@ -471,10 +471,10 @@
 		 * See qib_restart_rc().
 		 */
 		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_FIRST):
 		qp->s_state = OP(SEND_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_MIDDLE):
 		bth2 = qp->s_psn++ & QIB_PSN_MASK;
 		ss = &qp->s_sge;
@@ -510,10 +510,10 @@
 		 * See qib_restart_rc().
 		 */
 		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_WRITE_FIRST):
 		qp->s_state = OP(RDMA_WRITE_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_WRITE_MIDDLE):
 		bth2 = qp->s_psn++ & QIB_PSN_MASK;
 		ss = &qp->s_sge;
@@ -1807,7 +1807,7 @@
 		if (!ret)
 			goto rnr_nak;
 		qp->r_rcv_len = 0;
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_MIDDLE):
 	case OP(RDMA_WRITE_MIDDLE):
 send_middle:
@@ -1839,7 +1839,7 @@
 		qp->r_rcv_len = 0;
 		if (opcode == OP(SEND_ONLY))
 			goto no_immediate_data;
-		/* fall through -- for SEND_ONLY_WITH_IMMEDIATE */
+		fallthrough;	/* for SEND_ONLY_WITH_IMMEDIATE */
 	case OP(SEND_LAST_WITH_IMMEDIATE):
 send_last_imm:
 		wc.ex.imm_data = ohdr->u.imm_data;
diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c
index 99e11c3..5e86cbf 100644
--- a/drivers/infiniband/hw/qib/qib_sdma.c
+++ b/drivers/infiniband/hw/qib/qib_sdma.c
@@ -62,7 +62,7 @@
 static void sdma_put(struct qib_sdma_state *);
 static void sdma_set_state(struct qib_pportdata *, enum qib_sdma_states);
 static void sdma_start_sw_clean_up(struct qib_pportdata *);
-static void sdma_sw_clean_up_task(unsigned long);
+static void sdma_sw_clean_up_task(struct tasklet_struct *);
 static void unmap_desc(struct qib_pportdata *, unsigned);
 
 static void sdma_get(struct qib_sdma_state *ss)
@@ -119,9 +119,10 @@
 	}
 }
 
-static void sdma_sw_clean_up_task(unsigned long opaque)
+static void sdma_sw_clean_up_task(struct tasklet_struct *t)
 {
-	struct qib_pportdata *ppd = (struct qib_pportdata *) opaque;
+	struct qib_pportdata *ppd = from_tasklet(ppd, t,
+						 sdma_sw_clean_up_task);
 	unsigned long flags;
 
 	spin_lock_irqsave(&ppd->sdma_lock, flags);
@@ -436,8 +437,7 @@
 
 	INIT_LIST_HEAD(&ppd->sdma_activelist);
 
-	tasklet_init(&ppd->sdma_sw_clean_up_task, sdma_sw_clean_up_task,
-		(unsigned long)ppd);
+	tasklet_setup(&ppd->sdma_sw_clean_up_task, sdma_sw_clean_up_task);
 
 	ret = dd->f_init_sdma_regs(ppd);
 	if (ret)
@@ -763,7 +763,7 @@
 			 * bringing the link up with traffic active on
 			 * 7220, e.g. */
 			ss->go_s99_running = 1;
-			/* fall through -- and start dma engine */
+			fallthrough;	/* and start dma engine */
 		case qib_sdma_event_e10_go_hw_start:
 			/* This reference means the state machine is started */
 			sdma_get(&ppd->sdma_state);
diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c
index e17b91e..554af42 100644
--- a/drivers/infiniband/hw/qib/qib_uc.c
+++ b/drivers/infiniband/hw/qib/qib_uc.c
@@ -161,7 +161,7 @@
 
 	case OP(SEND_FIRST):
 		qp->s_state = OP(SEND_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_MIDDLE):
 		len = qp->s_len;
 		if (len > pmtu) {
@@ -185,7 +185,7 @@
 
 	case OP(RDMA_WRITE_FIRST):
 		qp->s_state = OP(RDMA_WRITE_MIDDLE);
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_WRITE_MIDDLE):
 		len = qp->s_len;
 		if (len > pmtu) {
@@ -351,7 +351,7 @@
 			goto no_immediate_data;
 		else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
 			goto send_last_imm;
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(SEND_MIDDLE):
 		/* Check for invalid length PMTU or posted rwqe len. */
 		if (unlikely(tlen != (hdrsize + pmtu + 4)))
@@ -440,7 +440,7 @@
 			wc.ex.imm_data = ohdr->u.rc.imm_data;
 			goto rdma_last_imm;
 		}
-		/* FALLTHROUGH */
+		fallthrough;
 	case OP(RDMA_WRITE_MIDDLE):
 		/* Check for invalid length PMTU or posted rwqe len. */
 		if (unlikely(tlen != (hdrsize + pmtu + 4)))
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index 6bf764e..4c24e83 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -40,7 +40,7 @@
 static void __qib_release_user_pages(struct page **p, size_t num_pages,
 				     int dirty)
 {
-	put_user_pages_dirty_lock(p, num_pages, dirty);
+	unpin_user_pages_dirty_lock(p, num_pages, dirty);
 }
 
 /**
@@ -106,18 +106,18 @@
 		goto bail;
 	}
 
-	down_read(&current->mm->mmap_sem);
+	mmap_read_lock(current->mm);
 	for (got = 0; got < num_pages; got += ret) {
-		ret = get_user_pages(start_page + got * PAGE_SIZE,
+		ret = pin_user_pages(start_page + got * PAGE_SIZE,
 				     num_pages - got,
 				     FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE,
 				     p + got, NULL);
 		if (ret < 0) {
-			up_read(&current->mm->mmap_sem);
+			mmap_read_unlock(current->mm);
 			goto bail_release;
 		}
 	}
-	up_read(&current->mm->mmap_sem);
+	mmap_read_unlock(current->mm);
 
 	return 0;
 bail_release:
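
The qib_user_pages.c hunks combine two conversions: the mmap_sem accessors become mmap_read_lock()/mmap_read_unlock(), and long-term user-page references move from get_user_pages()/put_user_page() to pin_user_pages()/unpin_user_pages_dirty_lock(). A sketch of the resulting pattern, assuming a hypothetical helper (pin_user_buffer is illustrative, not driver code):

#include <linux/mm.h>
#include <linux/sched/mm.h>

static int pin_user_buffer(unsigned long start, unsigned long num_pages,
			   struct page **pages)
{
	unsigned long got;
	long ret;

	mmap_read_lock(current->mm);	/* was down_read(&mm->mmap_sem) */
	for (got = 0; got < num_pages; got += ret) {
		ret = pin_user_pages(start + got * PAGE_SIZE, num_pages - got,
				     FOLL_LONGTERM | FOLL_WRITE,
				     pages + got, NULL);
		if (ret < 0) {
			mmap_read_unlock(current->mm);
			/* Drop whatever was pinned so far; nothing dirtied yet. */
			unpin_user_pages_dirty_lock(pages, got, false);
			return ret;
		}
	}
	mmap_read_unlock(current->mm);
	return 0;
}
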
diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c
index 05190ed..bf2f30d 100644
--- a/drivers/infiniband/hw/qib/qib_user_sdma.c
+++ b/drivers/infiniband/hw/qib/qib_user_sdma.c
@@ -317,7 +317,7 @@
 		 * the caller can ignore this page.
 		 */
 		if (put) {
-			put_user_page(page);
+			unpin_user_page(page);
 		} else {
 			/* coalesce case */
 			kunmap(page);
@@ -602,7 +602,7 @@
 /*
  * How many pages in this iovec element?
  */
-static int qib_user_sdma_num_pages(const struct iovec *iov)
+static size_t qib_user_sdma_num_pages(const struct iovec *iov)
 {
 	const unsigned long addr  = (unsigned long) iov->iov_base;
 	const unsigned long  len  = iov->iov_len;
@@ -631,7 +631,7 @@
 			kunmap(pkt->addr[i].page);
 
 		if (pkt->addr[i].put_page)
-			put_user_page(pkt->addr[i].page);
+			unpin_user_page(pkt->addr[i].page);
 		else
 			__free_page(pkt->addr[i].page);
 	} else if (pkt->addr[i].kvaddr) {
@@ -658,7 +658,7 @@
 static int qib_user_sdma_pin_pages(const struct qib_devdata *dd,
 				   struct qib_user_sdma_queue *pq,
 				   struct qib_user_sdma_pkt *pkt,
-				   unsigned long addr, int tlen, int npages)
+				   unsigned long addr, int tlen, size_t npages)
 {
 	struct page *pages[8];
 	int i, j;
@@ -670,7 +670,7 @@
 		else
 			j = npages;
 
-		ret = get_user_pages_fast(addr, j, FOLL_LONGTERM, pages);
+		ret = pin_user_pages_fast(addr, j, FOLL_LONGTERM, pages);
 		if (ret != j) {
 			i = 0;
 			j = ret;
@@ -706,7 +706,7 @@
 	/* if error, return all pages not managed by pkt */
 free_pages:
 	while (i < j)
-		put_user_page(pages[i++]);
+		unpin_user_page(pages[i++]);
 
 done:
 	return ret;
@@ -722,7 +722,7 @@
 	unsigned long idx;
 
 	for (idx = 0; idx < niov; idx++) {
-		const int npages = qib_user_sdma_num_pages(iov + idx);
+		const size_t npages = qib_user_sdma_num_pages(iov + idx);
 		const unsigned long addr = (unsigned long) iov[idx].iov_base;
 
 		ret = qib_user_sdma_pin_pages(dd, pq, pkt, addr,
@@ -824,8 +824,8 @@
 		unsigned pktnw;
 		unsigned pktnwc;
 		int nfrags = 0;
-		int npages = 0;
-		int bytes_togo = 0;
+		size_t npages = 0;
+		size_t bytes_togo = 0;
 		int tiddma = 0;
 		int cfur;
 
@@ -885,7 +885,11 @@
 
 			npages += qib_user_sdma_num_pages(&iov[idx]);
 
-			bytes_togo += slen;
+			if (check_add_overflow(bytes_togo, slen, &bytes_togo) ||
+			    bytes_togo > type_max(typeof(pkt->bytes_togo))) {
+				ret = -EINVAL;
+				goto free_pbc;
+			}
 			pktnwc += slen >> 2;
 			idx++;
 			nfrags++;
@@ -904,8 +908,7 @@
 		}
 
 		if (frag_size) {
-			int tidsmsize, n;
-			size_t pktsize;
+			size_t tidsmsize, n, pktsize, sz, addrlimit;
 
 			n = npages*((2*PAGE_SIZE/frag_size)+1);
 			pktsize = struct_size(pkt, addr, n);
@@ -923,14 +926,24 @@
 			else
 				tidsmsize = 0;
 
-			pkt = kmalloc(pktsize+tidsmsize, GFP_KERNEL);
+			if (check_add_overflow(pktsize, tidsmsize, &sz)) {
+				ret = -EINVAL;
+				goto free_pbc;
+			}
+			pkt = kmalloc(sz, GFP_KERNEL);
 			if (!pkt) {
 				ret = -ENOMEM;
 				goto free_pbc;
 			}
 			pkt->largepkt = 1;
 			pkt->frag_size = frag_size;
-			pkt->addrlimit = n + ARRAY_SIZE(pkt->addr);
+			if (check_add_overflow(n, ARRAY_SIZE(pkt->addr),
+					       &addrlimit) ||
+			    addrlimit > type_max(typeof(pkt->addrlimit))) {
+				ret = -EINVAL;
+				goto free_pkt;
+			}
+			pkt->addrlimit = addrlimit;
 
 			if (tiddma) {
 				char *tidsm = (char *)pkt + pktsize;
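
The bytes_togo, pktsize and addrlimit changes above guard user-controlled size arithmetic with check_add_overflow() and type_max() from <linux/overflow.h>, returning -EINVAL instead of silently wrapping. A small sketch of that idiom with hypothetical names (my_pkt and accumulate_len are illustrative):

#include <linux/overflow.h>
#include <linux/types.h>
#include <linux/errno.h>

struct my_pkt {
	u16 bytes_togo;		/* narrow field the running total must fit in */
};

static int accumulate_len(struct my_pkt *pkt, size_t *total, size_t slen)
{
	size_t sum;

	/* Fail on size_t wrap-around or on overflow of the u16 field. */
	if (check_add_overflow(*total, slen, &sum) ||
	    sum > type_max(typeof(pkt->bytes_togo)))
		return -EINVAL;

	*total = sum;
	return 0;
}
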
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index 5ef93f8..f6c01ba 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -39,7 +39,6 @@
 #include <linux/utsname.h>
 #include <linux/rculist.h>
 #include <linux/mm.h>
-#include <linux/random.h>
 #include <linux/vmalloc.h>
 #include <rdma/rdma_vt.h>
 
@@ -238,7 +237,7 @@
 	case IB_QPT_GSI:
 		if (ib_qib_disable_sma)
 			break;
-		/* FALLTHROUGH */
+		fallthrough;
 	case IB_QPT_UD:
 		qib_ud_rcv(ibp, hdr, has_grh, data, tlen, qp);
 		break;
@@ -1461,7 +1460,6 @@
 	rdi->dparms.props.max_cq = ib_qib_max_cqs;
 	rdi->dparms.props.max_cqe = ib_qib_max_cqes;
 	rdi->dparms.props.max_ah = ib_qib_max_ahs;
-	rdi->dparms.props.max_map_per_fmr = 32767;
 	rdi->dparms.props.max_qp_rd_atom = QIB_MAX_RDMA_ATOMIC;
 	rdi->dparms.props.max_qp_init_rd_atom = 255;
 	rdi->dparms.props.max_srq = ib_qib_max_srqs;
@@ -1503,7 +1501,6 @@
 	unsigned i, ctxt;
 	int ret;
 
-	get_random_bytes(&dev->qp_rnd, sizeof(dev->qp_rnd));
 	for (i = 0; i < dd->num_pports; i++)
 		init_ibport(ppd + i);
 
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
index 17bdf8a..dc0e81f 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -177,7 +177,6 @@
 	struct timer_list mem_timer;
 	struct qib_pio_header *pio_hdrs;
 	dma_addr_t pio_hdrs_phys;
-	u32 qp_rnd; /* random bytes for hash */
 
 	u32 n_piowait;
 	u32 n_txwait;
@@ -245,9 +244,8 @@
 void qib_node_desc_chg(struct qib_ibport *ibp);
 int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 		    const struct ib_wc *in_wc, const struct ib_grh *in_grh,
-		    const struct ib_mad_hdr *in, size_t in_mad_size,
-		    struct ib_mad_hdr *out, size_t *out_mad_size,
-		    u16 *out_mad_pkey_index);
+		    const struct ib_mad *in, struct ib_mad *out,
+		    size_t *out_mad_size, u16 *out_mad_pkey_index);
 void qib_notify_create_mad_agent(struct rvt_dev_info *rdi, int port_idx);
 void qib_notify_free_mad_agent(struct rvt_dev_info *rdi, int port_idx);
 
diff --git a/drivers/infiniband/hw/usnic/Kconfig b/drivers/infiniband/hw/usnic/Kconfig
index c0847d9..9019599 100644
--- a/drivers/infiniband/hw/usnic/Kconfig
+++ b/drivers/infiniband/hw/usnic/Kconfig
@@ -6,6 +6,6 @@
 	select ENIC
 	select NET_VENDOR_CISCO
 	select PCI_IOV
-	---help---
+	help
 	  This is a low-level driver for Cisco's Virtual Interface
 	  Cards (VICs), including the VIC 1240 and 1280 cards.
diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.c b/drivers/infiniband/hw/usnic/usnic_fwd.c
index 7875883..398c4c0 100644
--- a/drivers/infiniband/hw/usnic/usnic_fwd.c
+++ b/drivers/infiniband/hw/usnic/usnic_fwd.c
@@ -214,7 +214,7 @@
 	if (!flow)
 		return ERR_PTR(-ENOMEM);
 
-	tlv = pci_alloc_consistent(pdev, tlv_size, &tlv_pa);
+	tlv = dma_alloc_coherent(&pdev->dev, tlv_size, &tlv_pa, GFP_ATOMIC);
 	if (!tlv) {
 		usnic_err("Failed to allocate memory\n");
 		status = -ENOMEM;
@@ -258,7 +258,7 @@
 
 out_free_tlv:
 	spin_unlock(&ufdev->lock);
-	pci_free_consistent(pdev, tlv_size, tlv, tlv_pa);
+	dma_free_coherent(&pdev->dev, tlv_size, tlv, tlv_pa);
 	if (!status)
 		return flow;
 out_free_flow:
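
The usnic_fwd.c change is the standard PCI DMA API migration: pci_alloc_consistent(pdev, ...) was a thin wrapper around dma_alloc_coherent(&pdev->dev, ..., GFP_ATOMIC), and pci_free_consistent() likewise maps onto dma_free_coherent(). A sketch under that assumption (alloc_tlv/free_tlv are illustrative helpers, not usnic code):

#include <linux/pci.h>
#include <linux/dma-mapping.h>

static void *alloc_tlv(struct pci_dev *pdev, size_t size, dma_addr_t *pa)
{
	/* GFP_ATOMIC because the caller may hold a spinlock. */
	return dma_alloc_coherent(&pdev->dev, size, pa, GFP_ATOMIC);
}

static void free_tlv(struct pci_dev *pdev, size_t size, void *tlv,
		     dma_addr_t pa)
{
	dma_free_coherent(&pdev->dev, size, tlv, pa);
}
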
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
index c9abe1c..aa2e65f 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_main.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -120,7 +120,7 @@
 								IB_QPS_ERR,
 								NULL);
 				if (status) {
-					usnic_err("Failed to transistion qp grp %u from %s to %s\n",
+					usnic_err("Failed to transition qp grp %u from %s to %s\n",
 						qp_grp->grp_id,
 						usnic_ib_qp_grp_state_to_string
 						(cur_state),
@@ -315,7 +315,6 @@
 	if (err)
 		return err;
 
-	immutable->pkey_tbl_len = attr.pkey_tbl_len;
 	immutable->gid_tbl_len = attr.gid_tbl_len;
 
 	return 0;
@@ -355,7 +354,6 @@
 	.modify_qp = usnic_ib_modify_qp,
 	.query_device = usnic_ib_query_device,
 	.query_gid = usnic_ib_query_gid,
-	.query_pkey = usnic_ib_query_pkey,
 	.query_port = usnic_ib_query_port,
 	.query_qp = usnic_ib_query_qp,
 	.reg_user_mr = usnic_ib_reg_mr,
@@ -427,7 +425,8 @@
 	if (ret)
 		goto err_fwd_dealloc;
 
-	if (ib_register_device(&us_ibdev->ib_dev, "usnic_%d"))
+	dma_set_max_seg_size(&dev->dev, SZ_2G);
+	if (ib_register_device(&us_ibdev->ib_dev, "usnic_%d", &dev->dev))
 		goto err_fwd_dealloc;
 
 	usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu);
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
index a102a5d..6a2b7d1 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -325,7 +325,6 @@
 	props->max_mcast_grp = 0;
 	props->max_mcast_qp_attach = 0;
 	props->max_total_mcast_qp_attach = 0;
-	props->max_map_per_fmr = 0;
 	/* Owned by Userspace
 	 * max_qp_wr, max_sge, max_sge_rd, max_cqe */
 	mutex_unlock(&us_ibdev->usdev_lock);
@@ -371,7 +370,6 @@
 
 	props->port_cap_flags = 0;
 	props->gid_tbl_len = 1;
-	props->pkey_tbl_len = 1;
 	props->bad_pkey_cntr = 0;
 	props->qkey_viol_cntr = 0;
 	props->max_mtu = IB_MTU_4096;
@@ -441,16 +439,6 @@
 	return 0;
 }
 
-int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
-				u16 *pkey)
-{
-	if (index > 0)
-		return -EINVAL;
-
-	*pkey = 0xffff;
-	return 0;
-}
-
 int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
 	struct usnic_ib_pd *pd = to_upd(ibpd);
@@ -464,9 +452,10 @@
 	return 0;
 }
 
-void usnic_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+int usnic_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
 	usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd);
+	return 0;
 }
 
 struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
@@ -507,7 +496,7 @@
 	if (init_attr->qp_type != IB_QPT_UD) {
 		usnic_err("%s asked to make a non-UD QP: %d\n",
 			  dev_name(&us_ibdev->ib_dev.dev), init_attr->qp_type);
-		return ERR_PTR(-EINVAL);
+		return ERR_PTR(-EOPNOTSUPP);
 	}
 
 	trans_spec = cmd.spec;
@@ -600,9 +589,9 @@
 	return 0;
 }
 
-void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
-	return;
+	return 0;
 }
 
 struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
index 2aedf78..11fe1ba 100644
--- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
@@ -48,10 +48,8 @@
 				struct ib_qp_init_attr *qp_init_attr);
 int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
 				union ib_gid *gid);
-int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
-				u16 *pkey);
 int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
-void usnic_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+int usnic_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
 					struct ib_qp_init_attr *init_attr,
 					struct ib_udata *udata);
@@ -60,7 +58,7 @@
 				int attr_mask, struct ib_udata *udata);
 int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		       struct ib_udata *udata);
-void usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
 				u64 virt_addr, int access_flags,
 				struct ib_udata *udata);
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 62e6ffa..760b254 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -75,7 +75,7 @@
 		for_each_sg(chunk->page_list, sg, chunk->nents, i) {
 			page = sg_page(sg);
 			pa = sg_phys(sg);
-			put_user_pages_dirty_lock(&page, 1, dirty);
+			unpin_user_pages_dirty_lock(&page, 1, dirty);
 			usnic_dbg("pa: %pa\n", &pa);
 		}
 		kfree(chunk);
@@ -123,7 +123,7 @@
 	npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT;
 
 	uiomr->owning_mm = mm = current->mm;
-	down_read(&mm->mmap_sem);
+	mmap_read_lock(mm);
 
 	locked = atomic64_add_return(npages, &current->mm->pinned_vm);
 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
@@ -141,7 +141,7 @@
 	ret = 0;
 
 	while (npages) {
-		ret = get_user_pages(cur_base,
+		ret = pin_user_pages(cur_base,
 				     min_t(unsigned long, npages,
 				     PAGE_SIZE / sizeof(struct page *)),
 				     gup_flags | FOLL_LONGTERM,
@@ -187,7 +187,7 @@
 	} else
 		mmgrab(uiomr->owning_mm);
 
-	up_read(&mm->mmap_sem);
+	mmap_read_unlock(mm);
 	free_page((unsigned long) page_list);
 	return ret;
 }
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h
index 70be49b..7ec8991 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.h
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.h
@@ -77,7 +77,7 @@
 struct usnic_uiom_chunk {
 	struct list_head		list;
 	int				nents;
-	struct scatterlist		page_list[0];
+	struct scatterlist		page_list[];
 };
 
 struct usnic_uiom_pd *usnic_uiom_alloc_pd(void);
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c
index d399523..29d7126 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c
@@ -83,7 +83,8 @@
 	return interval;
 }
 
-static int interval_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int interval_cmp(void *priv, const struct list_head *a,
+			const struct list_head *b)
 {
 	struct usnic_uiom_interval_node *node_a, *node_b;
 
diff --git a/drivers/infiniband/hw/vmw_pvrdma/Kconfig b/drivers/infiniband/hw/vmw_pvrdma/Kconfig
index b99c9f0..a11e892 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/Kconfig
+++ b/drivers/infiniband/hw/vmw_pvrdma/Kconfig
@@ -2,7 +2,7 @@
 config INFINIBAND_VMWARE_PVRDMA
 	tristate "VMware Paravirtualized RDMA Driver"
 	depends on NETDEVICES && ETHERNET && PCI && INET && VMXNET3
-	---help---
+	help
 	  This driver provides low-level support for VMware Paravirtual
 	  RDMA adapter. It interacts with the VMXNet3 driver to provide
 	  Ethernet capabilities.
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
index c142f5e..de57f2f 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h
@@ -509,6 +509,20 @@
 	return flags & PVRDMA_MASK(PVRDMA_SEND_FLAGS_MAX);
 }
 
+static inline int pvrdma_network_type_to_ib(enum pvrdma_network_type type)
+{
+	switch (type) {
+	case PVRDMA_NETWORK_ROCE_V1:
+		return RDMA_NETWORK_ROCE_V1;
+	case PVRDMA_NETWORK_IPV4:
+		return RDMA_NETWORK_IPV4;
+	case PVRDMA_NETWORK_IPV6:
+		return RDMA_NETWORK_IPV6;
+	default:
+		return RDMA_NETWORK_IPV6;
+	}
+}
+
 void pvrdma_qp_cap_to_ib(struct ib_qp_cap *dst,
 			 const struct pvrdma_qp_cap *src);
 void ib_qp_cap_to_pvrdma(struct pvrdma_qp_cap *dst,
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
index 7800e69..62164db 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
@@ -135,14 +135,14 @@
 			goto err_cq;
 		}
 
-		cq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size,
-				       IB_ACCESS_LOCAL_WRITE, 1);
+		cq->umem = ib_umem_get(ibdev, ucmd.buf_addr, ucmd.buf_size,
+				       IB_ACCESS_LOCAL_WRITE);
 		if (IS_ERR(cq->umem)) {
 			ret = PTR_ERR(cq->umem);
 			goto err_cq;
 		}
 
-		npages = ib_umem_page_count(cq->umem);
+		npages = ib_umem_num_dma_blocks(cq->umem, PAGE_SIZE);
 	} else {
 		/* One extra page for shared ring state */
 		npages = 1 + (entries * sizeof(struct pvrdma_cqe) +
@@ -235,7 +235,7 @@
  * @cq: the completion queue to destroy.
  * @udata: user data or null for kernel object
  */
-void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
+int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
 	struct pvrdma_cq *vcq = to_vcq(cq);
 	union pvrdma_cmd_req req;
@@ -261,6 +261,7 @@
 
 	pvrdma_free_cq(dev, vcq);
 	atomic_dec(&dev->num_cqs);
+	return 0;
 }
 
 static inline struct pvrdma_cqe *get_cqe(struct pvrdma_cq *cq, int i)
@@ -363,7 +364,7 @@
 	wc->dlid_path_bits = cqe->dlid_path_bits;
 	wc->port_num = cqe->port_num;
 	wc->vendor_err = cqe->vendor_err;
-	wc->network_hdr_type = cqe->network_hdr_type;
+	wc->network_hdr_type = pvrdma_network_type_to_ib(cqe->network_hdr_type);
 
 	/* Update shared ring state */
 	pvrdma_idx_ring_inc(&cq->ring_state->rx.cons_head, cq->ibcq.cqe);
@@ -375,7 +376,7 @@
  * pvrdma_poll_cq - poll for work completion queue entries
  * @ibcq: completion queue
  * @num_entries: the maximum number of entries
- * @entry: pointer to work completion array
+ * @wc: pointer to work completion array
  *
  * @return: number of polled completion entries
  */
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
index 8f9749d..86a6c05 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h
@@ -58,7 +58,8 @@
 #define PVRDMA_ROCEV1_VERSION		17
 #define PVRDMA_ROCEV2_VERSION		18
 #define PVRDMA_PPN64_VERSION		19
-#define PVRDMA_VERSION			PVRDMA_PPN64_VERSION
+#define PVRDMA_QPHANDLE_VERSION		20
+#define PVRDMA_VERSION			PVRDMA_QPHANDLE_VERSION
 
 #define PVRDMA_BOARD_ID			1
 #define PVRDMA_REV_ID			1
@@ -581,6 +582,17 @@
 	u32 max_inline_data;
 };
 
+struct pvrdma_cmd_create_qp_resp_v2 {
+	struct pvrdma_cmd_resp_hdr hdr;
+	u32 qpn;
+	u32 qp_handle;
+	u32 max_send_wr;
+	u32 max_recv_wr;
+	u32 max_send_sge;
+	u32 max_recv_sge;
+	u32 max_inline_data;
+};
+
 struct pvrdma_cmd_modify_qp {
 	struct pvrdma_cmd_hdr hdr;
 	u32 qp_handle;
@@ -663,6 +675,7 @@
 	struct pvrdma_cmd_create_cq_resp create_cq_resp;
 	struct pvrdma_cmd_resize_cq_resp resize_cq_resp;
 	struct pvrdma_cmd_create_qp_resp create_qp_resp;
+	struct pvrdma_cmd_create_qp_resp_v2 create_qp_resp_v2;
 	struct pvrdma_cmd_query_qp_resp query_qp_resp;
 	struct pvrdma_cmd_destroy_qp_resp destroy_qp_resp;
 	struct pvrdma_cmd_create_srq_resp create_srq_resp;
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
index 10e6728..6895bac 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
@@ -270,7 +270,7 @@
 	spin_lock_init(&dev->srq_tbl_lock);
 	rdma_set_device_sysfs_group(&dev->ib_dev, &pvrdma_attr_group);
 
-	ret = ib_register_device(&dev->ib_dev, "vmw_pvrdma%d");
+	ret = ib_register_device(&dev->ib_dev, "vmw_pvrdma%d", &dev->pdev->dev);
 	if (ret)
 		goto err_srq_free;
 
@@ -854,7 +854,7 @@
 			goto err_free_resource;
 		}
 	}
-
+	dma_set_max_seg_size(&pdev->dev, UINT_MAX);
 	pci_set_master(pdev);
 
 	/* Map register space */
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c
index 7944c58..ba43ad0 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_misc.c
@@ -182,17 +182,16 @@
 int pvrdma_page_dir_insert_umem(struct pvrdma_page_dir *pdir,
 				struct ib_umem *umem, u64 offset)
 {
+	struct ib_block_iter biter;
 	u64 i = offset;
 	int ret = 0;
-	struct sg_dma_page_iter sg_iter;
 
 	if (offset >= pdir->npages)
 		return -EINVAL;
 
-	for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
-		dma_addr_t addr = sg_page_iter_dma_address(&sg_iter);
-
-		ret = pvrdma_page_dir_insert_dma(pdir, i, addr);
+	rdma_umem_for_each_dma_block (umem, &biter, PAGE_SIZE) {
+		ret = pvrdma_page_dir_insert_dma(
+			pdir, i, rdma_block_iter_dma_address(&biter));
 		if (ret)
 			goto exit;
 
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
index f3a3d22..e80848b 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
@@ -126,14 +126,14 @@
 		return ERR_PTR(-EINVAL);
 	}
 
-	umem = ib_umem_get(udata, start, length, access_flags, 0);
+	umem = ib_umem_get(pd->device, start, length, access_flags);
 	if (IS_ERR(umem)) {
 		dev_warn(&dev->pdev->dev,
 			 "could not get umem for mem region\n");
 		return ERR_CAST(umem);
 	}
 
-	npages = ib_umem_num_pages(umem);
+	npages = ib_umem_num_dma_blocks(umem, PAGE_SIZE);
 	if (npages < 0 || npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
 		dev_warn(&dev->pdev->dev, "overflow %d pages in mem region\n",
 			 npages);
@@ -202,7 +202,7 @@
  * @return: ib_mr pointer on success, otherwise returns an errno.
  */
 struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			      u32 max_num_sg, struct ib_udata *udata)
+			      u32 max_num_sg)
 {
 	struct pvrdma_dev *dev = to_vdev(pd->device);
 	struct pvrdma_user_mr *mr;
@@ -270,6 +270,7 @@
 /**
  * pvrdma_dereg_mr - deregister a memory region
  * @ibmr: memory region
+ * @udata: pointer to user data
  *
  * @return: 0 on success.
  */
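
The pvrdma umem hunks track two rdma-core API updates visible in this diff: ib_umem_get() now takes the ib_device (instead of udata) and drops the old dmasync flag, and page counting uses ib_umem_num_dma_blocks() rather than ib_umem_page_count(). A hedged sketch of the call pattern (map_user_region and MAX_REGION_PAGES are illustrative, not pvrdma symbols):

#include <linux/err.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>

#define MAX_REGION_PAGES 512	/* illustrative limit, not a pvrdma constant */

static int map_user_region(struct ib_pd *pd, u64 start, u64 length,
			   int access_flags, struct ib_umem **out)
{
	struct ib_umem *umem;

	/* 5.10-era signature: device pointer, no dmasync argument. */
	umem = ib_umem_get(pd->device, start, length, access_flags);
	if (IS_ERR(umem))
		return PTR_ERR(umem);

	/* Count the PAGE_SIZE blocks backing the region. */
	if (ib_umem_num_dma_blocks(umem, PAGE_SIZE) > MAX_REGION_PAGES) {
		ib_umem_release(umem);
		return -EINVAL;
	}

	*out = umem;
	return 0;
}
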
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
index bca6a58..428256c 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
@@ -52,6 +52,9 @@
 
 #include "pvrdma.h"
 
+static void __pvrdma_destroy_qp(struct pvrdma_dev *dev,
+				struct pvrdma_qp *qp);
+
 static inline void get_cqs(struct pvrdma_qp *qp, struct pvrdma_cq **send_cq,
 			   struct pvrdma_cq **recv_cq)
 {
@@ -195,7 +198,9 @@
 	union pvrdma_cmd_resp rsp;
 	struct pvrdma_cmd_create_qp *cmd = &req.create_qp;
 	struct pvrdma_cmd_create_qp_resp *resp = &rsp.create_qp_resp;
+	struct pvrdma_cmd_create_qp_resp_v2 *resp_v2 = &rsp.create_qp_resp_v2;
 	struct pvrdma_create_qp ucmd;
+	struct pvrdma_create_qp_resp qp_resp = {};
 	unsigned long flags;
 	int ret;
 	bool is_srq = !!init_attr->srq;
@@ -212,7 +217,7 @@
 	    init_attr->qp_type != IB_QPT_GSI) {
 		dev_warn(&dev->pdev->dev, "queuepair type %d not supported\n",
 			 init_attr->qp_type);
-		return ERR_PTR(-EINVAL);
+		return ERR_PTR(-EOPNOTSUPP);
 	}
 
 	if (is_srq && !dev->dsr->caps.max_srq) {
@@ -227,13 +232,12 @@
 	switch (init_attr->qp_type) {
 	case IB_QPT_GSI:
 		if (init_attr->port_num == 0 ||
-		    init_attr->port_num > pd->device->phys_port_cnt ||
-		    udata) {
+		    init_attr->port_num > pd->device->phys_port_cnt) {
 			dev_warn(&dev->pdev->dev, "invalid queuepair attrs\n");
 			ret = -EINVAL;
 			goto err_qp;
 		}
-		/* fall through */
+		fallthrough;
 	case IB_QPT_RC:
 	case IB_QPT_UD:
 		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
@@ -260,10 +264,20 @@
 				goto err_qp;
 			}
 
+			/* Userspace supports qpn and qp handles? */
+			if (dev->dsr_version >= PVRDMA_QPHANDLE_VERSION &&
+			    udata->outlen < sizeof(qp_resp)) {
+				dev_warn(&dev->pdev->dev,
+					 "create queuepair not supported\n");
+				ret = -EOPNOTSUPP;
+				goto err_qp;
+			}
+
 			if (!is_srq) {
 				/* set qp->sq.wqe_cnt, shift, buf_size.. */
-				qp->rumem = ib_umem_get(udata, ucmd.rbuf_addr,
-							ucmd.rbuf_size, 0, 0);
+				qp->rumem =
+					ib_umem_get(pd->device, ucmd.rbuf_addr,
+						    ucmd.rbuf_size, 0);
 				if (IS_ERR(qp->rumem)) {
 					ret = PTR_ERR(qp->rumem);
 					goto err_qp;
@@ -274,8 +288,8 @@
 				qp->srq = to_vsrq(init_attr->srq);
 			}
 
-			qp->sumem = ib_umem_get(udata, ucmd.sbuf_addr,
-						ucmd.sbuf_size, 0, 0);
+			qp->sumem = ib_umem_get(pd->device, ucmd.sbuf_addr,
+						ucmd.sbuf_size, 0);
 			if (IS_ERR(qp->sumem)) {
 				if (!is_srq)
 					ib_umem_release(qp->rumem);
@@ -283,9 +297,11 @@
 				goto err_qp;
 			}
 
-			qp->npages_send = ib_umem_page_count(qp->sumem);
+			qp->npages_send =
+				ib_umem_num_dma_blocks(qp->sumem, PAGE_SIZE);
 			if (!is_srq)
-				qp->npages_recv = ib_umem_page_count(qp->rumem);
+				qp->npages_recv = ib_umem_num_dma_blocks(
+					qp->rumem, PAGE_SIZE);
 			else
 				qp->npages_recv = 0;
 			qp->npages = qp->npages_send + qp->npages_recv;
@@ -379,13 +395,33 @@
 	}
 
 	/* max_send_wr/_recv_wr/_send_sge/_recv_sge/_inline_data */
-	qp->qp_handle = resp->qpn;
 	qp->port = init_attr->port_num;
-	qp->ibqp.qp_num = resp->qpn;
+
+	if (dev->dsr_version >= PVRDMA_QPHANDLE_VERSION) {
+		qp->ibqp.qp_num = resp_v2->qpn;
+		qp->qp_handle = resp_v2->qp_handle;
+	} else {
+		qp->ibqp.qp_num = resp->qpn;
+		qp->qp_handle = resp->qpn;
+	}
+
 	spin_lock_irqsave(&dev->qp_tbl_lock, flags);
 	dev->qp_tbl[qp->qp_handle % dev->dsr->caps.max_qp] = qp;
 	spin_unlock_irqrestore(&dev->qp_tbl_lock, flags);
 
+	if (udata) {
+		qp_resp.qpn = qp->ibqp.qp_num;
+		qp_resp.qp_handle = qp->qp_handle;
+
+		if (ib_copy_to_udata(udata, &qp_resp,
+				     min(udata->outlen, sizeof(qp_resp)))) {
+			dev_warn(&dev->pdev->dev,
+				 "failed to copy back udata\n");
+			__pvrdma_destroy_qp(dev, qp);
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
 	return &qp->ibqp;
 
 err_pdir:
@@ -400,27 +436,15 @@
 	return ERR_PTR(ret);
 }
 
-static void pvrdma_free_qp(struct pvrdma_qp *qp)
+static void _pvrdma_free_qp(struct pvrdma_qp *qp)
 {
+	unsigned long flags;
 	struct pvrdma_dev *dev = to_vdev(qp->ibqp.device);
-	struct pvrdma_cq *scq;
-	struct pvrdma_cq *rcq;
-	unsigned long flags, scq_flags, rcq_flags;
-
-	/* In case cq is polling */
-	get_cqs(qp, &scq, &rcq);
-	pvrdma_lock_cqs(scq, rcq, &scq_flags, &rcq_flags);
-
-	_pvrdma_flush_cqe(qp, scq);
-	if (scq != rcq)
-		_pvrdma_flush_cqe(qp, rcq);
 
 	spin_lock_irqsave(&dev->qp_tbl_lock, flags);
 	dev->qp_tbl[qp->qp_handle] = NULL;
 	spin_unlock_irqrestore(&dev->qp_tbl_lock, flags);
 
-	pvrdma_unlock_cqs(scq, rcq, &scq_flags, &rcq_flags);
-
 	if (refcount_dec_and_test(&qp->refcnt))
 		complete(&qp->free);
 	wait_for_completion(&qp->free);
@@ -435,34 +459,71 @@
 	atomic_dec(&dev->num_qps);
 }
 
-/**
- * pvrdma_destroy_qp - destroy a queue pair
- * @qp: the queue pair to destroy
- * @udata: user data or null for kernel object
- *
- * @return: 0 on success.
- */
-int pvrdma_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
+static void pvrdma_free_qp(struct pvrdma_qp *qp)
 {
-	struct pvrdma_qp *vqp = to_vqp(qp);
+	struct pvrdma_cq *scq;
+	struct pvrdma_cq *rcq;
+	unsigned long scq_flags, rcq_flags;
+
+	/* In case cq is polling */
+	get_cqs(qp, &scq, &rcq);
+	pvrdma_lock_cqs(scq, rcq, &scq_flags, &rcq_flags);
+
+	_pvrdma_flush_cqe(qp, scq);
+	if (scq != rcq)
+		_pvrdma_flush_cqe(qp, rcq);
+
+	/*
+	 * We're now unlocking the CQs before clearing out the qp handle; this
+	 * should still be safe. We have destroyed the backend QP and flushed
+	 * the CQEs so there should be no other completions for this QP.
+	 */
+	pvrdma_unlock_cqs(scq, rcq, &scq_flags, &rcq_flags);
+
+	_pvrdma_free_qp(qp);
+}
+
+static inline void _pvrdma_destroy_qp_work(struct pvrdma_dev *dev,
+					   u32 qp_handle)
+{
 	union pvrdma_cmd_req req;
 	struct pvrdma_cmd_destroy_qp *cmd = &req.destroy_qp;
 	int ret;
 
 	memset(cmd, 0, sizeof(*cmd));
 	cmd->hdr.cmd = PVRDMA_CMD_DESTROY_QP;
-	cmd->qp_handle = vqp->qp_handle;
+	cmd->qp_handle = qp_handle;
 
-	ret = pvrdma_cmd_post(to_vdev(qp->device), &req, NULL, 0);
+	ret = pvrdma_cmd_post(dev, &req, NULL, 0);
 	if (ret < 0)
-		dev_warn(&to_vdev(qp->device)->pdev->dev,
+		dev_warn(&dev->pdev->dev,
 			 "destroy queuepair failed, error: %d\n", ret);
+}
 
+/**
+ * pvrdma_destroy_qp - destroy a queue pair
+ * @qp: the queue pair to destroy
+ * @udata: user data or null for kernel object
+ *
+ * @return: always 0.
+ */
+int pvrdma_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
+{
+	struct pvrdma_qp *vqp = to_vqp(qp);
+
+	_pvrdma_destroy_qp_work(to_vdev(qp->device), vqp->qp_handle);
 	pvrdma_free_qp(vqp);
 
 	return 0;
 }
 
+static void __pvrdma_destroy_qp(struct pvrdma_dev *dev,
+				struct pvrdma_qp *qp)
+{
+	_pvrdma_destroy_qp_work(dev, qp->qp_handle);
+	_pvrdma_free_qp(qp);
+}
+
 /**
  * pvrdma_modify_qp - modify queue pair attributes
  * @ibqp: the queue pair
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
index 36cdfbd..082208f 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
@@ -90,7 +90,7 @@
 
 /**
  * pvrdma_create_srq - create shared receive queue
- * @pd: protection domain
+ * @ibsrq: the IB shared receive queue
  * @init_attr: shared receive queue attributes
  * @udata: user data
  *
@@ -146,13 +146,13 @@
 		goto err_srq;
 	}
 
-	srq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size, 0, 0);
+	srq->umem = ib_umem_get(ibsrq->device, ucmd.buf_addr, ucmd.buf_size, 0);
 	if (IS_ERR(srq->umem)) {
 		ret = PTR_ERR(srq->umem);
 		goto err_srq;
 	}
 
-	srq->npages = ib_umem_page_count(srq->umem);
+	srq->npages = ib_umem_num_dma_blocks(srq->umem, PAGE_SIZE);
 
 	if (srq->npages < 0 || srq->npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
 		dev_warn(&dev->pdev->dev,
@@ -240,7 +240,7 @@
  *
  * @return: 0 for success.
  */
-void pvrdma_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
+int pvrdma_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
 {
 	struct pvrdma_srq *vsrq = to_vsrq(srq);
 	union pvrdma_cmd_req req;
@@ -259,6 +259,7 @@
 			 ret);
 
 	pvrdma_free_srq(dev, vsrq);
+	return 0;
 }
 
 /**
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
index faf7ecd..fc412cb 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
@@ -479,9 +479,9 @@
  * @pd: the protection domain to be released
  * @udata: user data or null for kernel object
  *
- * @return: 0 on success, otherwise errno.
+ * @return: Always 0
  */
-void pvrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
+int pvrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
 	struct pvrdma_dev *dev = to_vdev(pd->device);
 	union pvrdma_cmd_req req = {};
@@ -498,20 +498,21 @@
 			 ret);
 
 	atomic_dec(&dev->num_pds);
+	return 0;
 }
 
 /**
  * pvrdma_create_ah - create an address handle
- * @pd: the protection domain
- * @ah_attr: the attributes of the AH
- * @udata: user data blob
- * @flags: create address handle flags (see enum rdma_create_ah_flags)
+ * @ibah: the IB address handle
+ * @init_attr: the attributes of the AH
+ * @udata: pointer to user data
  *
  * @return: 0 on success, otherwise errno.
  */
-int pvrdma_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
-		     u32 flags, struct ib_udata *udata)
+int pvrdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr,
+		     struct ib_udata *udata)
 {
+	struct rdma_ah_attr *ah_attr = init_attr->ah_attr;
 	struct pvrdma_dev *dev = to_vdev(ibah->device);
 	struct pvrdma_ah *ah = to_vah(ibah);
 	const struct ib_global_route *grh;
@@ -547,9 +548,10 @@
  * @flags: destroy address handle flags (see enum rdma_destroy_ah_flags)
  *
  */
-void pvrdma_destroy_ah(struct ib_ah *ah, u32 flags)
+int pvrdma_destroy_ah(struct ib_ah *ah, u32 flags)
 {
 	struct pvrdma_dev *dev = to_vdev(ah->device);
 
 	atomic_dec(&dev->num_ahs);
+	return 0;
 }
diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
index e4a48f5..97ed8f9 100644
--- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
+++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
@@ -399,31 +399,31 @@
 int pvrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
 void pvrdma_dealloc_ucontext(struct ib_ucontext *context);
 int pvrdma_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
-void pvrdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
+int pvrdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
 struct ib_mr *pvrdma_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 				 u64 virt_addr, int access_flags,
 				 struct ib_udata *udata);
 int pvrdma_dereg_mr(struct ib_mr *mr, struct ib_udata *udata);
 struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-			      u32 max_num_sg, struct ib_udata *udata);
+			      u32 max_num_sg);
 int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
 		     int sg_nents, unsigned int *sg_offset);
 int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
 		     struct ib_udata *udata);
-void pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
+int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
-int pvrdma_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+int pvrdma_create_ah(struct ib_ah *ah, struct rdma_ah_init_attr *init_attr,
 		     struct ib_udata *udata);
-void pvrdma_destroy_ah(struct ib_ah *ah, u32 flags);
+int pvrdma_destroy_ah(struct ib_ah *ah, u32 flags);
 
 int pvrdma_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *init_attr,
 		      struct ib_udata *udata);
 int pvrdma_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		      enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
 int pvrdma_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
-void pvrdma_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
+int pvrdma_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
 
 struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
 			       struct ib_qp_init_attr *init_attr,